import pandas as pd
from sklearn.model_selection import train_test_split

# Load the accessories catalogue once; the original cell read the same CSV
# twice, which only doubled the I/O.
accessories = pd.read_csv('accessories.csv')

# Hold out 10% of the rows. The seed is fixed (the same value the bags and
# beauty splits use) so the counts and plots below are reproducible.
train_set_accessories, test_set_accessories = train_test_split(
    accessories, test_size=0.1, random_state=42
)

# Report how many rows landed in each partition.
print("Training set accessories size:", len(train_set_accessories))
print("Testing set accessories size:", len(test_set_accessories))
Training set accessories size: 5722 Testing set accessories size: 636
import matplotlib.pyplot as plt
import seaborn as sns

# Apply the seaborn grid style once; the original cell repeated the imports
# and this call verbatim.
sns.set(style="whitegrid")

# Plot: the first ten subcategories of the value_counts() ordering for the
# accessories test set.
top_subcategory_order = test_set_accessories['subcategory'].value_counts().index[:10]
plt.figure(figsize=(10, 6))
sns.countplot(
    y='subcategory',
    data=test_set_accessories,
    order=top_subcategory_order,
    palette='viridis',
)
plt.title('Top 10 accessories Category Products ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()
# Frequency of every subcategory in the accessories test set.
subcategory_counts = test_set_accessories['subcategory'].value_counts()

# Sum of all per-subcategory counts, and the first ten entries of the table.
total_subcategories = subcategory_counts.sum()
top_10_subcategories = subcategory_counts.iloc[:10]

# Show the ten leading subcategories followed by the overall total.
print(top_10_subcategories)
print('Total accessories ', total_subcategories)
subcategory Chapeaux & Bonnets 106 Beanie Hat 78 Baseball Caps 60 Foulards & Écharpes 56 Bucket Hat 44 Flat Caps 33 Chaussettes 30 Chaussettes & Collants 26 Gants 26 Ceintures & Bretelles 20 Name: count, dtype: int64 Total accessories 636
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Use the seaborn grid style for this figure.
sns.set(style="whitegrid")

# Frequency table for the accessories test set; its last ten entries are
# taken as the bottom subcategories.
subcategory_counts = test_set_accessories['subcategory'].value_counts()
bottom_10_subcategories = subcategory_counts.iloc[-10:]

# Plot: counts restricted to those ten subcategories.
plt.figure(figsize=(10, 6))
sns.countplot(
    y=test_set_accessories['subcategory'],
    order=bottom_10_subcategories.index,
    palette='viridis',
)
plt.title('accessories Bottom 10 Product Categories ')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Print the same ten subcategories with their counts.
print(bottom_10_subcategories)
subcategory Coques de téléphone 6 Tech Accessories 3 Boutons de manchettes 3 Sun Protection Sleeve 3 Ipad Case 2 Autres 2 Voiles 1 Favoris de mariage 1 Berets 1 Fleurs 1 Name: count, dtype: int64
# Total likes per subcategory, largest total first.
likes_per_subcategory = test_set_accessories.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=False)

# Keep the ten subcategories with the most likes.
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bar chart: one bar per subcategory, bar length = total likes.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
plt.title('Top 10 Subcategories with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Print the plotted totals.
print(top_10_subcategory_likes)
subcategory Beanie Hat 11954 Foulards & Écharpes 8443 Chapeaux & Bonnets 6340 Chaussettes & Collants 4795 Chaussettes 4237 Baseball Caps 3328 Bucket Hat 3293 Gants 2657 Flat Caps 2289 Ceintures & Bretelles 2264 Name: likes_count, dtype: int64
# Total likes per subcategory, smallest total first.
likes_per_subcategory = test_set_accessories.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=True)

# NOTE: despite the "top_10" name, with the ascending sort these are the ten
# subcategories with the FEWEST likes.
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bar chart: one bar per subcategory, bar length = total likes.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
plt.title('Top 10 Subcategories with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Print the plotted totals.
print(top_10_subcategory_likes)
subcategory Sun Protection Sleeve 10 Berets 12 Voiles 24 Ipad Case 40 Fleurs 48 Favoris de mariage 128 Tech Accessories 220 Cache-oreilles & Masques 300 Headband & Bandeaux 310 Autres 365 Name: likes_count, dtype: int64
# Histogram (15 bins) of current prices with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(
    test_set_accessories['current_price'],
    kde=True,
    bins=15,
    color='blue',
)
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Scatter of discount against likes count for the accessories test set.
plt.figure(figsize=(10, 8))
sns.scatterplot(
    x='discount',
    y='likes_count',
    data=test_set_accessories,
    color='red',
)
plt.title(' accessories Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
# Numeric columns to inspect for outliers.
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# One box plot per column, laid out on a 2x2 grid (subplot slots are 1-based).
plt.figure(figsize=(15, 10))
for panel, var in enumerate(continuous_vars, start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(y=test_set_accessories[var])
    plt.title(f'Box Plot of {var}')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# The helper below is applied to the accessories test set (test_set_accessories).
# Define a function that caps outliers at the IQR-based lower and upper bounds
def cap_outliers(df, column):
    """Return a copy of ``df`` with ``column`` clipped to IQR-based bounds.

    The bounds are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of the column. Values outside the
    bounds are replaced by the nearest bound; values inside are unchanged.

    Args:
        df: DataFrame holding the column to treat.
        column: Name of a numeric column in ``df``.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Work on a copy: assigning into the argument would silently change the
    # caller's frame, and when that frame is a slice of another DataFrame
    # pandas can emit SettingWithCopyWarning.
    capped = df.copy()
    capped[column] = capped[column].clip(lower_bound, upper_bound)
    return capped
# Cap each continuous column in turn, rebinding the test set to the result.
for column in ('current_price', 'raw_price', 'discount', 'likes_count'):
    test_set_accessories = cap_outliers(test_set_accessories, column)

# Redraw the 2x2 grid of box plots on the capped data (subplot slots are 1-based).
plt.figure(figsize=(15, 10))
treated_columns = ['current_price', 'raw_price', 'discount', 'likes_count']
for panel, var in enumerate(treated_columns, start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(y=test_set_accessories[var])
    plt.title(f'Box Plot of {var} (Outliers Treated)')
plt.tight_layout()
plt.show()
import sklearn
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is reused; fit_transform refits it on every
# column, so afterwards it only holds the classes of the last column encoded.
label_encoder = LabelEncoder()

# Text columns to replace with integer codes.
categorical_vars = ['subcategory', 'name']

# Overwrite each text column with its integer codes.
for var in categorical_vars:
    encoded_column = label_encoder.fit_transform(test_set_accessories[var])
    test_set_accessories[var] = encoded_column

# Notebook display: first rows after encoding.
test_set_accessories.head()
| category | subcategory | name | current_price | raw_price | discount | likes_count | |
|---|---|---|---|---|---|---|---|
| 640 | accessories | 8 | 238 | 24.5375 | 48.86 | 52 | 204.0 |
| 2700 | accessories | 20 | 118 | 8.1700 | 14.86 | 45 | 99.0 |
| 5430 | accessories | 28 | 417 | 10.1300 | 23.87 | 58 | 52.0 |
| 6156 | accessories | 2 | 152 | 16.4200 | 37.96 | 57 | 25.0 |
| 3562 | accessories | 9 | 151 | 11.5800 | 20.61 | 44 | 61.0 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Bucket the discount percentage into ten-point bins. include_lowest=True
# keeps items with a 0% discount in the '0-10' bin; by default pd.cut leaves
# the left edge of the first bin open and would label them NaN.
discount_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
discount_labels = ['0-10', '10-20', '20-30', '30-40', '40-50',
                   '50-60', '60-70', '70-80', '80-90', '90-100']
test_set_accessories['discount_bin'] = pd.cut(
    test_set_accessories['discount'],
    bins=discount_edges,
    labels=discount_labels,
    include_lowest=True,
)

# Mean likes per discount bin. observed=False is spelled out so empty bins
# stay in the result regardless of the pandas default.
test_set_discount_likes = (
    test_set_accessories.groupby('discount_bin', observed=False)['likes_count']
    .mean()
    .sort_values()
)

# Plot the per-bin averages. The original cell plotted `discount_likes`, a
# name this cell never defines, instead of the series computed just above.
plt.figure(figsize=(10, 6))
sns.barplot(
    x=test_set_discount_likes.index,
    y=test_set_discount_likes.values,
    palette='coolwarm',
)
plt.title('Effect of Discount on Likes Count for accessories')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of current price against likes count for the accessories test set.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=test_set_accessories,
    x='current_price',
    y='likes_count',
    color='blue',
    alpha=0.8,
)
plt.title('Relationship Between Price and Likes Count for accessories ')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Columns being compared.
current_price = test_set_accessories['current_price']
likes_count = test_set_accessories['likes_count']

# Midpoint of each column's range (halfway between its min and max).
mid_price = (current_price.min() + current_price.max()) / 2
mid_likes = (likes_count.min() + likes_count.max()) / 2

# Scatter plot split into quadrants by dashed lines through the midpoints.
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')

plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Quadrant labels, each placed at an offset proportional to the midpoint:
# (label, fraction of mid_price added to x, fraction of mid_likes added to y).
quadrant_offsets = (
    ('Q1', 0.05, 0.05),
    ('Q2', -0.4, 0.05),
    ('Q3', -0.4, -0.4),
    ('Q4', 0.05, -0.4),
)
for label, dx, dy in quadrant_offsets:
    plt.text(mid_price + mid_price * dx, mid_likes + mid_likes * dy,
             label, fontsize=14, color='red')

plt.show()
# Column means used as the quadrant boundaries.
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Scatter plot split into quadrants by dashed lines through the means.
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')

plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Quadrant labels, each placed at an offset proportional to the mean:
# (label, fraction of mean_price added to x, fraction of mean_likes added to y).
quadrant_offsets = (
    ('Q1', 0.1, 0.1),
    ('Q2', -0.5, 0.1),
    ('Q3', -0.5, -0.5),
    ('Q4', 0.1, -0.5),
)
for label, dx, dy in quadrant_offsets:
    plt.text(mean_price + mean_price * dx, mean_likes + mean_likes * dy,
             label, fontsize=14, color='red')

plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of discount against likes count for the accessories test set.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='discount',
    y='likes_count',
    data=test_set_accessories,
    color='blue',
    alpha=0.6,
)
# The x-axis is the discount, so the title names it (the original title was
# copied from the price plot and still said "Price").
plt.title('Relationship Between Discount and Likes Count for accessories ')
plt.xlabel('Discount')
plt.ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Columns being compared.
discount = test_set_accessories['discount']
likes_count = test_set_accessories['likes_count']

# Midpoint of each column's range (halfway between its min and max).
mid_discount = (discount.min() + discount.max()) / 2
mid_likes = (likes_count.min() + likes_count.max()) / 2

# Scatter plot split into quadrants by dashed lines through the midpoints.
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')

plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Quadrant labels, each placed at an offset proportional to the midpoint:
# (label, fraction of mid_discount added to x, fraction of mid_likes added to y).
quadrant_offsets = (
    (' Q1', 0.05, 0.05),
    (' Q2', -0.4, 0.05),
    (' Q3', -0.4, -0.4),
    (' Q4', 0.05, -0.4),
)
for label, dx, dy in quadrant_offsets:
    plt.text(mid_discount + mid_discount * dx, mid_likes + mid_likes * dy,
             label, fontsize=14, color='red')

plt.show()
# Column means used as the quadrant boundaries.
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Scatter plot split into quadrants by dashed lines through the means
# (horizontal at the mean likes, vertical at the mean discount).
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')

plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Quadrant labels, each placed at an offset proportional to the mean:
# (label, fraction of mean_discount added to x, fraction of mean_likes added to y).
quadrant_offsets = (
    ('Q1', 0.1, 0.1),
    ('Q2', -0.5, 0.1),
    ('Q3', -0.5, -0.5),
    ('Q4', 0.1, -0.5),
)
for label, dx, dy in quadrant_offsets:
    plt.text(mean_discount + mean_discount * dx, mean_likes + mean_likes * dy,
             label, fontsize=14, color='red')

plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_accessories[['current_price', 'discount']]

# Fit K-Means once. The original cell first fitted a 5-cluster model whose
# labels and per-cluster averages were overwritten by this 3-cluster fit
# before anything read them, so that first fit was dead work.
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust n_clusters as needed
test_set_accessories['cluster'] = kmeans.fit_predict(X)

# Average likes of the products assigned to each cluster.
cluster_likes = test_set_accessories.groupby('cluster')['likes_count'].mean()

# Bar chart: one bar per cluster label.
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3. warnings.warn( C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3. warnings.warn(
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the bags catalogue once; the original cell read the same CSV twice.
bags = pd.read_csv('bags.csv')

# Hold out 10% of the rows with a fixed seed so the split is reproducible.
train_set_bags, test_set_bags = train_test_split(
    bags, test_size=0.1, random_state=42
)

# Report how many rows landed in each partition.
print("Training set bags size:", len(train_set_bags))
print("Testing set bags size:", len(test_set_bags))
Training set bags size: 5641 Testing set bags size: 627
import matplotlib.pyplot as plt
import seaborn as sns

# Use the seaborn grid style for the figures.
sns.set(style="whitegrid")

# Plot: the first ten subcategories of the value_counts() ordering for the
# bags test set.
top_subcategory_order = test_set_bags['subcategory'].value_counts().index[:10]
plt.figure(figsize=(10, 6))
sns.countplot(
    data=test_set_bags,
    y='subcategory',
    order=top_subcategory_order,
    palette='viridis',
)
plt.title('Top 10 Bags')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()
# Frequency of every subcategory in the bags test set.
subcategory_counts = test_set_bags['subcategory'].value_counts()

# Sum of all per-subcategory counts, and the first ten entries of the table.
total_subcategories = subcategory_counts.sum()
top_10_subcategories = subcategory_counts.iloc[:10]

# Show the ten leading subcategories followed by the overall total.
print(top_10_subcategories)
print('Total bags ', total_subcategories)
subcategory Sac bandoulière 244 Portefeuilles 141 Sacs à main 84 Sacs à dos 78 Sacs chic 28 Sacs de voyage 11 Cosmetic Bags 10 Étui & Sac des monnaies 7 Men's Bags 7 Sacs de rangement & Trousses 6 Name: count, dtype: int64 Total bags 627
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Use the seaborn grid style for this figure.
sns.set(style="whitegrid")

# Frequency table for the bags test set; its last ten entries are taken as
# the bottom subcategories.
subcategory_counts = test_set_bags['subcategory'].value_counts()
bottom_10_subcategories = subcategory_counts.iloc[-10:]

# Plot: counts restricted to those ten subcategories.
plt.figure(figsize=(10, 6))
sns.countplot(
    y=test_set_bags['subcategory'],
    order=bottom_10_subcategories.index,
    palette='viridis',
)
plt.title('Bags Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Print the same ten subcategories with their counts.
print(bottom_10_subcategories)
subcategory Sacs de voyage 11 Cosmetic Bags 10 Étui & Sac des monnaies 7 Men's Bags 7 Sacs de rangement & Trousses 6 Pochettes 5 Porte-documents 3 Pochettes & Clutches 1 Sacs cosmétiques 1 Bag Accessories 1 Name: count, dtype: int64
# Total likes per subcategory, smallest total first.
likes_per_subcategory = test_set_bags.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=True)

# NOTE: despite the "top_10" name, with the ascending sort these are the ten
# subcategories with the FEWEST likes.
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bar chart: one bar per subcategory, bar length = total likes.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
plt.title('Top 10 Bags with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Print the plotted totals.
print(top_10_subcategory_likes)
subcategory Pochettes & Clutches 8 Bag Accessories 179 Porte-documents 323 Men's Bags 450 Sacs de rangement & Trousses 1218 Pochettes 1361 Étui & Sac des monnaies 1751 Sacs chic 2597 Cosmetic Bags 3743 Sacs de voyage 7116 Name: likes_count, dtype: int64
# Total likes per subcategory, largest total first.
likes_per_subcategory = test_set_bags.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=False)

# Keep the ten subcategories with the most likes.
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bar chart: one bar per subcategory, bar length = total likes.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
plt.title('Top 10 Bags with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Print the plotted totals.
print(top_10_subcategory_likes)
subcategory Sac bandoulière 56711 Sacs à main 31328 Portefeuilles 20123 Sacs à dos 16607 Sacs cosmétiques 7266 Sacs de voyage 7116 Cosmetic Bags 3743 Sacs chic 2597 Étui & Sac des monnaies 1751 Pochettes 1361 Name: likes_count, dtype: int64
# Histogram (20 bins) of current prices with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
sns.histplot(
    test_set_bags['current_price'],
    kde=True,
    bins=20,
    color='blue',
)
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Scatter of discount against likes count for the bags test set.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=test_set_bags,
    x='discount',
    y='likes_count',
    color='red',
)
plt.title('bags Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Columns being compared.
discount = test_set_bags['discount']
likes_count = test_set_bags['likes_count']

# Midpoint of each column's range (halfway between its min and max).
mid_discount = (discount.min() + discount.max()) / 2
mid_likes = (likes_count.min() + likes_count.max()) / 2

# Scatter plot split into quadrants by dashed lines through the midpoints.
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')

plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Quadrant labels, each placed at an offset proportional to the midpoint:
# (label, fraction of mid_discount added to x, fraction of mid_likes added to y).
quadrant_offsets = (
    (' Q1', 0.05, 0.05),
    (' Q2', -0.4, 0.05),
    (' Q3', -0.4, -0.4),
    (' Q4', 0.05, -0.4),
)
for label, dx, dy in quadrant_offsets:
    plt.text(mid_discount + mid_discount * dx, mid_likes + mid_likes * dy,
             label, fontsize=14, color='red')

plt.show()
# Column means used as the quadrant boundaries.
mean_discount = discount.mean()
mean_likes = likes_count.mean()

# Scatter plot split into quadrants by dashed lines through the means
# (horizontal at the mean likes, vertical at the mean discount).
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')

plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

# Quadrant labels, each placed at an offset proportional to the mean:
# (label, fraction of mean_discount added to x, fraction of mean_likes added to y).
quadrant_offsets = (
    ('Q1', 0.1, 0.1),
    ('Q2', -0.5, 0.1),
    ('Q3', -0.5, -0.5),
    ('Q4', 0.1, -0.5),
)
for label, dx, dy in quadrant_offsets:
    plt.text(mean_discount + mean_discount * dx, mean_likes + mean_likes * dy,
             label, fontsize=14, color='red')

plt.show()
# Numeric columns to inspect for outliers.
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# One box plot per column, laid out on a 2x2 grid (subplot slots are 1-based).
plt.figure(figsize=(15, 10))
for panel, var in enumerate(continuous_vars, start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(y=test_set_bags[var])
    plt.title(f'Box Plot of {var}')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# The helper below is applied to the bags test set (test_set_bags).
# Define a function that caps outliers at the IQR-based lower and upper bounds
def cap_outliers(df, column):
    """Return a copy of ``df`` with ``column`` clipped to IQR-based bounds.

    The bounds are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1 and Q3
    are the 25th and 75th percentiles of the column. Values outside the
    bounds are replaced by the nearest bound; values inside are unchanged.

    Args:
        df: DataFrame holding the column to treat.
        column: Name of a numeric column in ``df``.

    Returns:
        A new DataFrame; ``df`` itself is not modified.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Work on a copy: assigning into the argument would silently change the
    # caller's frame, and when that frame is a slice of another DataFrame
    # pandas can emit SettingWithCopyWarning.
    capped = df.copy()
    capped[column] = capped[column].clip(lower_bound, upper_bound)
    return capped
# Cap each continuous column in turn, rebinding the test set to the result.
for column in ('current_price', 'raw_price', 'discount', 'likes_count'):
    test_set_bags = cap_outliers(test_set_bags, column)

# Redraw the 2x2 grid of box plots on the capped data (subplot slots are 1-based).
plt.figure(figsize=(15, 10))
treated_columns = ['current_price', 'raw_price', 'discount', 'likes_count']
for panel, var in enumerate(treated_columns, start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(y=test_set_bags[var])
    plt.title(f'Box Plot of {var} (Outliers Treated)')
plt.tight_layout()
plt.show()
import sklearn
from sklearn.preprocessing import LabelEncoder

# A single encoder instance is reused; fit_transform refits it on every
# column, so afterwards it only holds the classes of the last column encoded.
label_encoder = LabelEncoder()

# Text columns to replace with integer codes.
categorical_vars = ['subcategory', 'name']

# Overwrite each text column with its integer codes.
for var in categorical_vars:
    encoded_column = label_encoder.fit_transform(test_set_bags[var])
    test_set_bags[var] = encoded_column

# Notebook display: first rows after encoding.
test_set_bags.head()
| category | subcategory | name | current_price | raw_price | discount | likes_count | |
|---|---|---|---|---|---|---|---|
| 3158 | bags | 7 | 147 | 34.99 | 75.00 | 53.0 | 2.0 |
| 1345 | bags | 14 | 215 | 8.99 | 18.00 | 50.0 | 6.0 |
| 6110 | bags | 7 | 115 | 7.45 | 21.83 | 66.0 | 195.0 |
| 5532 | bags | 7 | 496 | 22.39 | 108.99 | 78.5 | 106.0 |
| 2521 | bags | 6 | 234 | 9.80 | 20.57 | 52.0 | 165.0 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Bucket the discount percentage into ten-point bins. include_lowest=True
# keeps items with a 0% discount in the '0-10' bin; by default pd.cut leaves
# the left edge of the first bin open and would label them NaN.
discount_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
discount_labels = ['0-10', '10-20', '20-30', '30-40', '40-50',
                   '50-60', '60-70', '70-80', '80-90', '90-100']
test_set_bags['discount_bin'] = pd.cut(
    test_set_bags['discount'],
    bins=discount_edges,
    labels=discount_labels,
    include_lowest=True,
)

# Mean likes per discount bin. observed=False is spelled out so empty bins
# stay in the result regardless of the pandas default.
discount_likes = (
    test_set_bags.groupby('discount_bin', observed=False)['likes_count']
    .mean()
    .sort_values()
)

# Bar chart of the per-bin averages.
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for bags')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter of current price against likes count for the bags test set.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=test_set_bags,
    x='current_price',
    y='likes_count',
    color='blue',
    alpha=0.6,
)
plt.title('Relationship Between Price and Likes Count for bags')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt

# Columns being compared.
current_price = test_set_bags['current_price']
likes_count = test_set_bags['likes_count']

# Midpoint of each column's range (halfway between its min and max).
mid_price = (current_price.min() + current_price.max()) / 2
mid_likes = (likes_count.min() + likes_count.max()) / 2

# Scatter plot split into quadrants by dashed lines through the midpoints.
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')

plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Quadrant labels, each placed at an offset proportional to the midpoint:
# (label, fraction of mid_price added to x, fraction of mid_likes added to y).
quadrant_offsets = (
    ('Q1', 0.05, 0.05),
    ('Q2', -0.4, 0.05),
    ('Q3', -0.4, -0.4),
    ('Q4', 0.05, -0.4),
)
for label, dx, dy in quadrant_offsets:
    plt.text(mid_price + mid_price * dx, mid_likes + mid_likes * dy,
             label, fontsize=14, color='red')

plt.show()
# Column means used as the quadrant boundaries.
mean_price = current_price.mean()
mean_likes = likes_count.mean()

# Scatter plot split into quadrants by dashed lines through the means.
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')

plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Quadrant labels, each placed at an offset proportional to the mean:
# (label, fraction of mean_price added to x, fraction of mean_likes added to y).
quadrant_offsets = (
    ('Q1', 0.1, 0.1),
    ('Q2', -0.5, 0.1),
    ('Q3', -0.5, -0.5),
    ('Q4', 0.1, -0.5),
)
for label, dx, dy in quadrant_offsets:
    plt.text(mean_price + mean_price * dx, mean_likes + mean_likes * dy,
             label, fontsize=14, color='red')

plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount)
X = test_set_bags[['current_price', 'discount']]

# Fit K-Means once. The original cell ran this identical 3-cluster fit twice
# in a row (a pasted duplicate), with the first result simply overwritten.
kmeans = KMeans(n_clusters=3, random_state=42)  # Adjust n_clusters as needed
test_set_bags['cluster'] = kmeans.fit_predict(X)

# Average likes of the products assigned to each cluster.
cluster_likes = test_set_bags.groupby('cluster')['likes_count'].mean()

# Bar chart: one bar per cluster label.
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3. warnings.warn( C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=3. warnings.warn(
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the beauty catalogue once; the original cell read the same CSV twice.
beauty = pd.read_csv('beauty.csv')

# Hold out 10% of the rows with a fixed seed so the split is reproducible.
train_set_beauty, test_set_beauty = train_test_split(
    beauty, test_size=0.1, random_state=42
)

# Report how many rows landed in each partition.
print("Training set beauty size:", len(train_set_beauty))
print("Testing set beauty size:", len(test_set_beauty))
Training set beauty size: 3423 Testing set beauty size: 381
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's white-grid theme for the figures.
sns.set(style="whitegrid")

# Number of products per subcategory in the beauty test split, most frequent
# first; the ten leading entries drive both the plot and the printout.
subcategory_counts = test_set_beauty['subcategory'].value_counts()
top_10_subcategories = subcategory_counts.iloc[:10]
# Sum of all per-subcategory counts.
total_subcategories = subcategory_counts.sum()

# Horizontal count plot restricted to the ten most frequent subcategories.
plt.figure(figsize=(10, 6))
ax = sns.countplot(
    data=test_set_beauty,
    y='subcategory',
    order=top_10_subcategories.index,
    palette='viridis',
)
ax.set_title('Top 10 Beauty')
ax.set_xlabel('Count')
ax.set_ylabel('subcategory')
plt.show()

# Print the ten counts followed by the overall total.
print(top_10_subcategories)
print('Total beauty ', total_subcategories)
subcategory Soin visage 19 Fard à paupières 13 Perruques synthétiques 13 Vêtements minceur 12 Trousses 11 Vernis à ongles 11 Soins des pieds 11 Autres outils 10 Sports Equipments 10 Accessoires soin visage 10 Name: count, dtype: int64 Total beauty 381
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Number of products per subcategory in the beauty test split; the last ten
# entries of the descending counts are the rarest subcategories.
subcategory_counts = test_set_beauty['subcategory'].value_counts()
bottom_10_subcategories = subcategory_counts.iloc[-10:]

# Use seaborn's white-grid theme for the figure.
sns.set(style="whitegrid")

# Horizontal count plot restricted to those ten subcategories.
plt.figure(figsize=(10, 6))
ax = sns.countplot(
    y=test_set_beauty['subcategory'],
    order=bottom_10_subcategories.index,
    palette='viridis',
)
ax.set_title(' Beauty Bottom 10 Product Categories')
ax.set_xlabel('Count')
ax.set_ylabel('Subcategory')
plt.show()

# Print the ten rarest subcategories with their counts.
print(bottom_10_subcategories)
subcategory Stylo rouge à lèvres 1 Démaquillage 1 Soins du visage 1 Pochoirs pour ongles 1 Oils 1 Polissoir pour ongles 1 Colle à ongles 1 Sèche-cheveux 1 Déodorants & Anhidrotiques 1 Organisateur cosmétique en acrylique 1 Name: count, dtype: int64
# Total likes per subcategory, ordered from the least to the most liked.
likes_per_subcategory = test_set_beauty.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values()

# With the ascending order above, the first ten entries are the ten
# subcategories with the fewest likes (the variable name is kept as-is).
top_10_subcategory_likes = subcategory_likes.head(10)

# Horizontal bar chart: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
ax = sns.barplot(
    x=top_10_subcategory_likes.to_numpy(),
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
ax.set_title('Top 10 Beauty with Least Likes Count')
ax.set_xlabel('Total Likes Count')
ax.set_ylabel('Subcategory')
plt.show()

# Print the ten least-liked subcategories with their like totals.
print(top_10_subcategory_likes)
subcategory Polissoir pour ongles 10 Spa & Aromathérapie & Diffuseurs 33 Pochoirs pour ongles 37 Sèche-cheveux 59 Démaquillage 60 Daily Necessities 77 Lisseurs des cheveux 79 Déodorants & Anhidrotiques 80 Soins des mains 85 Ciseaux & Cisailles de coiffure 91 Name: likes_count, dtype: int64
# Total likes per subcategory, ordered from the most to the least liked.
likes_per_subcategory = test_set_beauty.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=False)

# The first ten entries are the ten most-liked subcategories.
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bar chart: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
ax = sns.barplot(
    x=top_10_subcategory_likes.to_numpy(),
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
ax.set_title('Top 10 Beauty with Highest Likes Count')
ax.set_xlabel('Total Likes Count')
ax.set_ylabel('Subcategory')
plt.show()

# Print the ten most-liked subcategories with their like totals.
print(top_10_subcategory_likes)
subcategory Gloss à lèvres 8936 Vêtements minceur 3670 Correcteur 2957 Soin visage 2932 Faux cils 2667 Soins des pieds 2508 Rouge à lèvres 2272 Traitement des Cheveux & Cuir chevelu 2206 Trousses 1965 Vernis à ongles 1655 Name: likes_count, dtype: int64
# Histogram (75 bins) of current prices with a KDE curve overlaid.
plt.figure(figsize=(12, 8))
ax = sns.histplot(
    test_set_beauty['current_price'],
    kde=True,
    bins=75,
    color='blue',
)
ax.set_title('Distribution of Current Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

# Scatter of discount against likes, one point per product.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=test_set_beauty,
    x='discount',
    y='likes_count',
    color='red',
)
ax.set_title('beauty Discount vs. Likes Count')
ax.set_xlabel('Discount (%)')
ax.set_ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Columns compared in the two quadrant plots below.
discount = test_set_beauty['discount']
likes_count = test_set_beauty['likes_count']

# Shared look of the dashed lines that split each plot into quadrants.
divider_style = dict(color='black', linewidth=3, linestyle='--')

# ---- Plot 1: quadrants split at the midpoint of each observed range ----
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, **divider_style)      # horizontal line at the likes midpoint
plt.axvline(mid_discount, **divider_style)   # vertical line at the discount midpoint
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Quadrant labels sit at fixed fractions of the midpoint on either side of
# the dividing lines.
right_x = mid_discount + (mid_discount * 0.05)
left_x = mid_discount - (mid_discount * 0.4)
upper_y = mid_likes + (mid_likes * 0.05)
lower_y = mid_likes - (mid_likes * 0.4)
for label_x, label_y, label in (
    (right_x, upper_y, ' Q1'),
    (left_x, upper_y, ' Q2'),
    (left_x, lower_y, ' Q3'),
    (right_x, lower_y, ' Q4'),
):
    plt.text(label_x, label_y, label, fontsize=14, color='red')
plt.show()

# ---- Plot 2: quadrants split at the mean of each variable ----
mean_discount = discount.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, **divider_style)     # horizontal line at the mean of likes
plt.axvline(mean_discount, **divider_style)  # vertical line at the mean of discount
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

right_x = mean_discount + (mean_discount * 0.1)
left_x = mean_discount - (mean_discount * 0.5)
upper_y = mean_likes + (mean_likes * 0.1)
lower_y = mean_likes - (mean_likes * 0.5)
for label_x, label_y, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(label_x, label_y, label, fontsize=14, color='red')
plt.show()
# Numeric columns inspected for outliers.
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# One box plot per column, laid out on a 2x2 grid.
plt.figure(figsize=(15, 10))
for position, column_name in enumerate(continuous_vars, start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(y=test_set_beauty[column_name])
    plt.title(f'Box Plot of {column_name}')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming 'data_cleaned' is your DataFrame
# Cap outliers at the Tukey fences (1.5 * IQR beyond the quartiles).
def cap_outliers(df, column):
    """Return a copy of ``df`` with ``column`` clipped to its IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are raised to that lower fence and values
    above ``Q3 + 1.5 * IQR`` are lowered to that upper fence; values between
    the fences are unchanged.

    Args:
        df: DataFrame holding the column to treat.
        column: Name of a numeric column of ``df``.

    Returns:
        A new DataFrame equal to ``df`` except for the clipped ``column``.
        The frame passed in is not modified.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Assign on a copy: writing the column into the caller's frame would
    # mutate its data and, for frames that are slices of another frame,
    # trigger pandas' SettingWithCopyWarning.
    capped = df.copy()
    capped[column] = capped[column].clip(lower_bound, upper_bound)
    return capped
# Columns whose outliers are capped and then re-plotted.
treated_columns = ['current_price', 'raw_price', 'discount', 'likes_count']

# Cap one column at a time; each call hands back the treated frame.
for column in treated_columns:
    test_set_beauty = cap_outliers(test_set_beauty, column)

# Box plots of the same columns after the treatment, on a 2x2 grid.
plt.figure(figsize=(15, 10))
for position, var in enumerate(treated_columns, start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(y=test_set_beauty[var])
    plt.title(f'Box Plot of {var} (Outliers Treated)')
plt.tight_layout()
plt.show()
import sklearn
from sklearn.preprocessing import LabelEncoder

# Text columns that are replaced by integer codes.
categorical_vars = ['subcategory', 'name']

# Fit a separate encoder for every column and keep it, so each column's
# code -> label mapping stays available (e.g. for inverse_transform). A single
# shared encoder is refitted on every call and only remembers the last column.
beauty_label_encoders = {}
for var in categorical_vars:
    label_encoder = LabelEncoder()
    test_set_beauty[var] = label_encoder.fit_transform(test_set_beauty[var])
    beauty_label_encoders[var] = label_encoder

# Preview the first rows of the encoded frame.
test_set_beauty.head()
| | category | subcategory | name | current_price | raw_price | discount | likes_count |
|---|---|---|---|---|---|---|---|
| 2014 | beauty | 21 | 274 | 51.19 | 89.990 | 43.0 | 42.0 |
| 527 | beauty | 40 | 41 | 30.46 | 52.730 | 42.0 | 391.5 |
| 478 | beauty | 42 | 297 | 51.19 | 103.965 | 50.0 | 32.0 |
| 1684 | beauty | 6 | 177 | 9.19 | 18.660 | 51.0 | 33.0 |
| 1653 | beauty | 53 | 55 | 15.12 | 30.450 | 50.0 | 310.0 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Bucket the discount into 10-point ranges: edges 0, 10, ..., 100 and labels
# '0-10', '10-20', ..., '90-100'.
discount_edges = list(range(0, 101, 10))
discount_labels = [
    f'{low}-{high}' for low, high in zip(discount_edges[:-1], discount_edges[1:])
]
# pd.cut builds right-closed intervals, so a discount of exactly 0 would fall
# outside the first bucket and become NaN; include_lowest=True keeps it in
# '0-10'.
test_set_beauty['discount_bin'] = pd.cut(
    test_set_beauty['discount'],
    bins=discount_edges,
    labels=discount_labels,
    include_lowest=True,
)

# Mean likes per bucket, sorted from the lowest to the highest mean.
# observed=False keeps buckets without products in the result and states
# explicitly the behaviour that recent pandas versions warn about when the
# argument is left out for a categorical key.
discount_likes = (
    test_set_beauty.groupby('discount_bin', observed=False)['likes_count']
    .mean()
    .sort_values()
)

# Bar chart of average likes per discount range.
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for beauty')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter of current price against likes for the beauty test split, drawn
# semi-transparent so overlapping points remain visible.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=test_set_beauty,
    x='current_price',
    y='likes_count',
    color='blue',
    alpha=0.6,
)
ax.set_title('Relationship Between Price and Likes Count for beauty')
ax.set_xlabel('Current Price')
ax.set_ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Columns compared in the two quadrant plots below.
current_price = test_set_beauty['current_price']
likes_count = test_set_beauty['likes_count']

# Shared look of the dashed lines that split each plot into quadrants.
divider_style = dict(color='black', linewidth=3, linestyle='--')

# ---- Plot 1: quadrants split at the midpoint of each observed range ----
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, **divider_style)   # horizontal line at the likes midpoint
plt.axvline(mid_price, **divider_style)   # vertical line at the price midpoint
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Quadrant labels sit at fixed fractions of the midpoint on either side of
# the dividing lines.
right_x = mid_price + (mid_price * 0.05)
left_x = mid_price - (mid_price * 0.4)
upper_y = mid_likes + (mid_likes * 0.05)
lower_y = mid_likes - (mid_likes * 0.4)
for label_x, label_y, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(label_x, label_y, label, fontsize=14, color='red')
plt.show()

# ---- Plot 2: quadrants split at the mean of each variable ----
mean_price = current_price.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, **divider_style)  # horizontal line at the mean of likes
plt.axvline(mean_price, **divider_style)  # vertical line at the mean of price
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

right_x = mean_price + (mean_price * 0.1)
left_x = mean_price - (mean_price * 0.5)
upper_y = mean_likes + (mean_likes * 0.1)
lower_y = mean_likes - (mean_likes * 0.5)
for label_x, label_y, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(label_x, label_y, label, fontsize=14, color='red')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Features used for clustering the beauty test split (price and discount).
# NOTE(review): both features enter K-Means on their raw scales; confirm
# whether they should be standardised first.
X = test_set_beauty[['current_price', 'discount']]

# Fit K-Means once and store each product's cluster label. With the same
# data and seed a second fit yields the same labels, so one pass suffices.
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_beauty['cluster'] = kmeans.fit_predict(X)

# Average likes of the products in each cluster.
cluster_likes = test_set_beauty.groupby('cluster')['likes_count'].mean()

# Bar chart with one bar per cluster.
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the household catalogue (a single read of the CSV is enough).
house = pd.read_csv('house.csv')

# Hold out 10% of the rows as the test split; the fixed seed makes the split
# reproducible from run to run.
train_set_house, test_set_house = train_test_split(
    house, test_size=0.1, random_state=42
)

# Report the number of rows in each split.
print("Training set house size:", len(train_set_house))
print("Testing set house size:", len(test_set_house))
Training set house size: 11511 Testing set house size: 1280
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's white-grid theme for the figures.
sns.set(style="whitegrid")

# Number of products per subcategory in the house test split, most frequent
# first; the ten leading entries drive both the plot and the printout.
subcategory_counts = test_set_house['subcategory'].value_counts()
top_10_subcategories = subcategory_counts.iloc[:10]
# Sum of all per-subcategory counts.
total_subcategories = subcategory_counts.sum()

# Horizontal count plot restricted to the ten most frequent subcategories.
plt.figure(figsize=(10, 6))
ax = sns.countplot(
    data=test_set_house,
    y='subcategory',
    order=top_10_subcategories.index,
    palette='viridis',
)
ax.set_title('Top 10 Household items')
ax.set_xlabel('Count')
ax.set_ylabel('subcategory')
plt.show()

# Print the ten counts followed by the overall total.
print(top_10_subcategories)
print('Total house ', total_subcategories)
subcategory Housses de coussin 66 Jouets Squishy 52 Sacs de voyage & shopping 29 Sacs d'organisation de maison 26 Literie 25 Coussins & Oreillers 22 Autocollants de murs 21 Boîte de stockage 21 Sacs de ligne & cosmétique 20 Flowers 19 Name: count, dtype: int64 Total house 1280
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Number of products per subcategory in the house test split; the last ten
# entries of the descending counts are the rarest subcategories.
subcategory_counts = test_set_house['subcategory'].value_counts()
bottom_10_subcategories = subcategory_counts.iloc[-10:]

# Use seaborn's white-grid theme for the figure.
sns.set(style="whitegrid")

# Horizontal count plot restricted to those ten subcategories.
plt.figure(figsize=(10, 6))
ax = sns.countplot(
    y=test_set_house['subcategory'],
    order=bottom_10_subcategories.index,
    palette='viridis',
)
ax.set_title('Household Bottom 10 Product Categories')
ax.set_xlabel('Count')
ax.set_ylabel('Subcategory')
plt.show()

# Print the ten rarest subcategories with their counts.
print(bottom_10_subcategories)
subcategory Tongs 1 Sécurité de salle de bain 1 Fournitures d'envoi 1 Pailles 1 Alarme Smart 1 Maisons & Cages 1 Terrariums 1 Tool Sets 1 Lampes de toilette 1 Parapluies & Parasol 1 Name: count, dtype: int64
# Total likes per subcategory, ordered from the most to the least liked.
likes_per_subcategory = test_set_house.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=False)

# The first ten entries are the ten most-liked subcategories.
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bar chart: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
ax = sns.barplot(
    x=top_10_subcategory_likes.to_numpy(),
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
ax.set_title('Top 10 Household with Highest Likes Count')
ax.set_xlabel('Total Likes Count')
ax.set_ylabel('Subcategory')
plt.show()

# Print the ten most-liked subcategories with their like totals.
print(top_10_subcategory_likes)
subcategory Flowers 12510 Jouets Squishy 10215 Sacs d'organisation de maison 8753 Couture 7294 Housses de coussin 7025 Sacs de voyage & shopping 5551 Literie 4613 Coussins & Oreillers 4264 Fruits 4194 Home Carpets 3773 Name: likes_count, dtype: int64
# Total likes per subcategory, ordered from the least to the most liked.
likes_per_subcategory = test_set_house.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values()

# With the ascending order above, the first ten entries are the ten
# subcategories with the fewest likes (the variable name is kept as-is).
top_10_subcategory_likes = subcategory_likes.head(10)

# Horizontal bar chart: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
ax = sns.barplot(
    x=top_10_subcategory_likes.to_numpy(),
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
ax.set_title('Top 10 Household with Least Likes Count')
ax.set_xlabel('Total Likes Count')
ax.set_ylabel('Subcategory')
plt.show()

# Print the ten least-liked subcategories with their like totals.
print(top_10_subcategory_likes)
subcategory Lighting Accessories 2 Ventilateur 7 Vêtements de pluie & Parapluie 8 Arts & Artisanat & Couture 8 Pailles 9 Décoration de soirée 9 Ballons & Accessoires 9 Pots & Planters 11 Terrariums 12 Sacs & Boîtes de bonbons 13 Name: likes_count, dtype: int64
# Histogram (200 bins) of current prices with a KDE curve overlaid.
plt.figure(figsize=(10, 6))
ax = sns.histplot(
    test_set_house['current_price'],
    kde=True,
    bins=200,
    color='blue',
)
ax.set_title('Distribution of Current Prices')
ax.set_xlabel('Current Price')
ax.set_ylabel('Frequency')
plt.show()

# Scatter of discount against likes, one point per product.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=test_set_house,
    x='discount',
    y='likes_count',
    color='red',
)
ax.set_title('house Discount vs. Likes Count')
ax.set_xlabel('Discount (%)')
ax.set_ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Columns compared in the two quadrant plots below.
discount = test_set_house['discount']
likes_count = test_set_house['likes_count']

# Shared look of the dashed lines that split each plot into quadrants.
divider_style = dict(color='black', linewidth=3, linestyle='--')

# ---- Plot 1: quadrants split at the midpoint of each observed range ----
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, **divider_style)      # horizontal line at the likes midpoint
plt.axvline(mid_discount, **divider_style)   # vertical line at the discount midpoint
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Quadrant labels sit at fixed fractions of the midpoint on either side of
# the dividing lines.
right_x = mid_discount + (mid_discount * 0.05)
left_x = mid_discount - (mid_discount * 0.4)
upper_y = mid_likes + (mid_likes * 0.05)
lower_y = mid_likes - (mid_likes * 0.4)
for label_x, label_y, label in (
    (right_x, upper_y, ' Q1'),
    (left_x, upper_y, ' Q2'),
    (left_x, lower_y, ' Q3'),
    (right_x, lower_y, ' Q4'),
):
    plt.text(label_x, label_y, label, fontsize=14, color='red')
plt.show()

# ---- Plot 2: quadrants split at the mean of each variable ----
mean_discount = discount.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, **divider_style)     # horizontal line at the mean of likes
plt.axvline(mean_discount, **divider_style)  # vertical line at the mean of discount
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

right_x = mean_discount + (mean_discount * 0.1)
left_x = mean_discount - (mean_discount * 0.5)
upper_y = mean_likes + (mean_likes * 0.1)
lower_y = mean_likes - (mean_likes * 0.5)
for label_x, label_y, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(label_x, label_y, label, fontsize=14, color='red')
plt.show()
# Numeric columns inspected for outliers.
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# One box plot per column, laid out on a 2x2 grid.
plt.figure(figsize=(15, 10))
for position, column_name in enumerate(continuous_vars, start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(y=test_set_house[column_name])
    plt.title(f'Box Plot of {column_name}')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming 'data_cleaned' is your DataFrame
# Cap outliers at the Tukey fences (1.5 * IQR beyond the quartiles).
def cap_outliers(df, column):
    """Return a copy of ``df`` with ``column`` clipped to its IQR fences.

    Values below ``Q1 - 1.5 * IQR`` are raised to that lower fence and values
    above ``Q3 + 1.5 * IQR`` are lowered to that upper fence; values between
    the fences are unchanged.

    Args:
        df: DataFrame holding the column to treat.
        column: Name of a numeric column of ``df``.

    Returns:
        A new DataFrame equal to ``df`` except for the clipped ``column``.
        The frame passed in is not modified.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Assign on a copy: writing the column into the caller's frame would
    # mutate its data and, for frames that are slices of another frame,
    # trigger pandas' SettingWithCopyWarning.
    capped = df.copy()
    capped[column] = capped[column].clip(lower_bound, upper_bound)
    return capped
# Columns whose outliers are capped and then re-plotted.
treated_columns = ['current_price', 'raw_price', 'discount', 'likes_count']

# Cap one column at a time; each call hands back the treated frame.
for column in treated_columns:
    test_set_house = cap_outliers(test_set_house, column)

# Box plots of the same columns after the treatment, on a 2x2 grid.
plt.figure(figsize=(15, 10))
for position, var in enumerate(treated_columns, start=1):
    plt.subplot(2, 2, position)
    sns.boxplot(y=test_set_house[var])
    plt.title(f'Box Plot of {var} (Outliers Treated)')
plt.tight_layout()
plt.show()
import sklearn
from sklearn.preprocessing import LabelEncoder

# Text columns that are replaced by integer codes.
categorical_vars = ['subcategory', 'name']

# Fit a separate encoder for every column and keep it, so each column's
# code -> label mapping stays available (e.g. for inverse_transform). A single
# shared encoder is refitted on every call and only remembers the last column.
house_label_encoders = {}
for var in categorical_vars:
    label_encoder = LabelEncoder()
    test_set_house[var] = label_encoder.fit_transform(test_set_house[var])
    house_label_encoders[var] = label_encoder

# Preview the first rows of the encoded frame.
test_set_house.head()
| | category | subcategory | name | current_price | raw_price | discount | likes_count |
|---|---|---|---|---|---|---|---|
| 4091 | house | 174 | 544 | 5.76 | 20.59 | 72 | 27.0 |
| 3921 | house | 204 | 109 | 3.29 | 7.11 | 54 | 211.0 |
| 1355 | house | 8 | 490 | 24.99 | 43.99 | 43 | 31.0 |
| 6109 | house | 21 | 658 | 26.80 | 60.92 | 56 | 61.0 |
| 4132 | house | 135 | 84 | 4.79 | 9.99 | 52 | 57.0 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Bucket the discount into 10-point ranges: edges 0, 10, ..., 100 and labels
# '0-10', '10-20', ..., '90-100'.
discount_edges = list(range(0, 101, 10))
discount_labels = [
    f'{low}-{high}' for low, high in zip(discount_edges[:-1], discount_edges[1:])
]
# pd.cut builds right-closed intervals, so a discount of exactly 0 would fall
# outside the first bucket and become NaN; include_lowest=True keeps it in
# '0-10'.
test_set_house['discount_bin'] = pd.cut(
    test_set_house['discount'],
    bins=discount_edges,
    labels=discount_labels,
    include_lowest=True,
)

# Mean likes per bucket, sorted from the lowest to the highest mean.
# observed=False keeps buckets without products in the result and states
# explicitly the behaviour that recent pandas versions warn about when the
# argument is left out for a categorical key.
discount_likes = (
    test_set_house.groupby('discount_bin', observed=False)['likes_count']
    .mean()
    .sort_values()
)

# Bar chart of average likes per discount range.
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for house')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter of current price against likes for the house test split, drawn
# semi-transparent so overlapping points remain visible.
plt.figure(figsize=(10, 6))
ax = sns.scatterplot(
    data=test_set_house,
    x='current_price',
    y='likes_count',
    color='blue',
    alpha=0.6,
)
ax.set_title('Relationship Between Price and Likes Count for house')
ax.set_xlabel('Current Price')
ax.set_ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Columns compared in the two quadrant plots below.
current_price = test_set_house['current_price']
likes_count = test_set_house['likes_count']

# Shared look of the dashed lines that split each plot into quadrants.
divider_style = dict(color='black', linewidth=3, linestyle='--')

# ---- Plot 1: quadrants split at the midpoint of each observed range ----
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, **divider_style)   # horizontal line at the likes midpoint
plt.axvline(mid_price, **divider_style)   # vertical line at the price midpoint
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Quadrant labels sit at fixed fractions of the midpoint on either side of
# the dividing lines.
right_x = mid_price + (mid_price * 0.05)
left_x = mid_price - (mid_price * 0.4)
upper_y = mid_likes + (mid_likes * 0.05)
lower_y = mid_likes - (mid_likes * 0.4)
for label_x, label_y, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(label_x, label_y, label, fontsize=14, color='red')
plt.show()

# ---- Plot 2: quadrants split at the mean of each variable ----
mean_price = current_price.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, **divider_style)  # horizontal line at the mean of likes
plt.axvline(mean_price, **divider_style)  # vertical line at the mean of price
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

right_x = mean_price + (mean_price * 0.1)
left_x = mean_price - (mean_price * 0.5)
upper_y = mean_likes + (mean_likes * 0.1)
lower_y = mean_likes - (mean_likes * 0.5)
for label_x, label_y, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(label_x, label_y, label, fontsize=14, color='red')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Features used for clustering the house test split (price and discount).
# NOTE(review): both features enter K-Means on their raw scales; confirm
# whether they should be standardised first.
X = test_set_house[['current_price', 'discount']]

# Fit K-Means once and store each product's cluster label. With the same
# data and seed a second fit yields the same labels, so one pass suffices.
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_house['cluster'] = kmeans.fit_predict(X)

# Average likes of the products in each cluster.
cluster_likes = test_set_house.groupby('cluster')['likes_count'].mean()

# Bar chart with one bar per cluster.
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the jewelry catalogue (a single read of the CSV is enough).
jewelry = pd.read_csv('jewelry.csv')

# Hold out 10% of the rows as the test split; the fixed seed makes the split
# reproducible from run to run.
train_set_jewelry, test_set_jewelry = train_test_split(
    jewelry, test_size=0.1, random_state=42
)

# Report the number of rows in each split.
print("Training set jewelry size:", len(train_set_jewelry))
print("Testing set jewelry size:", len(test_set_jewelry))
Training set jewelry size: 4367 Testing set jewelry size: 486
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's white-grid theme for the figures.
sns.set(style="whitegrid")

# Number of products per subcategory in the jewelry test split, most frequent
# first; the ten leading entries drive both the plot and the printout.
subcategory_counts = test_set_jewelry['subcategory'].value_counts()
top_10_subcategories = subcategory_counts.iloc[:10]
# Sum of all per-subcategory counts.
total_subcategories = subcategory_counts.sum()

# Horizontal count plot restricted to the ten most frequent subcategories.
plt.figure(figsize=(10, 6))
ax = sns.countplot(
    data=test_set_jewelry,
    y='subcategory',
    order=top_10_subcategories.index,
    palette='viridis',
)
ax.set_title('Top 10 Jewelry ')
ax.set_xlabel('Count')
ax.set_ylabel('subcategory')
plt.show()

# Print the ten counts followed by the overall total.
print(top_10_subcategories)
print('Total jewelry ', total_subcategories)
subcategory Boucles d'oreilles 112 Colliers 78 Bracelets 52 Bagues 50 Montres pour homme 35 Montres pour femme 27 Accessoires des cheveux 16 Sets de bijoux 16 Montres connectées 13 Bracelets pour homme 12 Name: count, dtype: int64 Total jewelry 486
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Count products per subcategory in the jewelry test set (most frequent first).
subcategory_counts = test_set_jewelry['subcategory'].value_counts()
# The last ten entries of the descending counts are the rarest subcategories.
bottom_10_subcategories = subcategory_counts.tail(10)

# Set the style for the plots
sns.set(style="whitegrid")

# Plot: distribution of the ten least frequent subcategories.
plt.figure(figsize=(10, 6))
sns.countplot(
    y=test_set_jewelry['subcategory'],
    order=bottom_10_subcategories.index,
    palette='viridis',
)
# Title spelling fixed ('Jewelery' -> 'Jewelry') so it matches the other jewelry figures.
plt.title('Jewelry Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Display the bottom 10 subcategories and their counts
print(bottom_10_subcategories)
subcategory Fine Copper2 2 Boucles d'oreilles pour homme 2 Coffrets & Sacs de bijoux 2 Colliers de couple 2 Présentoirs de bijoux 2 Accessoires 2 Mascarade & Cosplay 2 Montres couple 1 Montres de poche 1 Bagues de couple 1 Name: count, dtype: int64
# Total likes per subcategory, ordered from the fewest likes upwards.
subcategory_likes = (
    test_set_jewelry
    .groupby('subcategory')['likes_count']
    .sum()
    .sort_values(ascending=True)
)

# NOTE: despite the variable name, these are the ten subcategories with the FEWEST total likes.
top_10_subcategory_likes = subcategory_likes.head(10)

# Horizontal bar chart: one bar per subcategory, bar length = total likes.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
plt.title('Top 10 Jewelry with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Print the same ten subcategories with their like totals.
print(top_10_subcategory_likes)
subcategory Mascarade & Cosplay 72 Accessoires 85 Montres de poche 87 Montres couple 121 Bijoux de corps 193 Présentoirs de bijoux 226 Boucles d'oreilles pour homme 280 Coffrets & Sacs de bijoux 289 Colliers de couple 346 Broches et épingles 358 Name: likes_count, dtype: int64
# Total likes per subcategory, ordered from the most likes downwards.
subcategory_likes = (
    test_set_jewelry
    .groupby('subcategory')['likes_count']
    .sum()
    .sort_values(ascending=False)
)

# Keep the ten subcategories with the highest like totals.
top_10_subcategory_likes = subcategory_likes.head(10)

# Horizontal bar chart: one bar per subcategory, bar length = total likes.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
plt.title('Top 10 Jewelry with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Print the same ten subcategories with their like totals.
print(top_10_subcategory_likes)
subcategory Boucles d'oreilles 20337 Colliers 18895 Bagues 11507 Bracelets 7153 Montres pour homme 3786 Bagues pour homme 3319 Montres pour femme 2991 Accessoires des cheveux 2583 Colliers pour homme 2184 Bracelets de cheville 1758 Name: likes_count, dtype: int64
# Histogram (100 bins) of current prices in the jewelry test set, with a KDE curve on top.
plt.figure(figsize=(10, 6))
sns.histplot(
    test_set_jewelry['current_price'],
    kde=True,
    bins=100,
    color='blue',
)
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Scatter of discount (x) against likes count (y), one point per product.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='discount',
    y='likes_count',
    data=test_set_jewelry,
    color='red',
)
plt.title('jewelry Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Columns compared in both quadrant plots below.
discount = test_set_jewelry['discount']
likes_count = test_set_jewelry['likes_count']

# --- Plot 1: quadrants split at the midpoint of each axis range, (max + min) / 2 ---
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)
# Dashed dividers: horizontal at the likes midpoint, vertical at the discount midpoint.
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Label positions are offsets proportional to the midpoint values themselves.
right_x = mid_discount + (mid_discount * 0.05)
left_x = mid_discount - (mid_discount * 0.4)
upper_y = mid_likes + (mid_likes * 0.05)
lower_y = mid_likes - (mid_likes * 0.4)
for x_pos, y_pos, label in (
    (right_x, upper_y, ' Q1'),
    (left_x, upper_y, ' Q2'),
    (left_x, lower_y, ' Q3'),
    (right_x, lower_y, ' Q4'),
):
    plt.text(x_pos, y_pos, label, fontsize=14, color='red')
plt.show()

# --- Plot 2: quadrants split at the mean of each column ---
mean_discount = discount.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)
# Dashed dividers: horizontal at the mean of likes, vertical at the mean of discount.
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

right_x = mean_discount + (mean_discount * 0.1)
left_x = mean_discount - (mean_discount * 0.5)
upper_y = mean_likes + (mean_likes * 0.1)
lower_y = mean_likes - (mean_likes * 0.5)
for x_pos, y_pos, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(x_pos, y_pos, label, fontsize=14, color='red')
plt.show()
# Numeric columns inspected for outliers.
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# One box plot per column, arranged on a 2 x 2 grid.
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    column_values = test_set_jewelry[var]
    sns.boxplot(y=column_values)
    plt.title(f'Box Plot of {var}')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming 'data_cleaned' is your DataFrame
# Cap outliers at the Tukey fences (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR).
def cap_outliers(df, column):
    """Return a copy of ``df`` with ``column`` clipped to its 1.5 * IQR fences.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of a numeric column of ``df``.

    Returns:
        A new DataFrame in which values of ``column`` below ``Q1 - 1.5 * IQR``
        or above ``Q3 + 1.5 * IQR`` are replaced by the respective bound.
        The frame passed in is left unmodified.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Work on a copy: callers pass frames sliced out of a larger DataFrame, and
    # assigning a column on such a slice can raise SettingWithCopyWarning and
    # silently mutates the caller's object.
    capped = df.copy()
    capped[column] = capped[column].clip(lower=lower_bound, upper=upper_bound)
    return capped
# Cap every numeric column in turn, rebinding the test set to the frame returned each time.
treated_columns = ['current_price', 'raw_price', 'discount', 'likes_count']
for column in treated_columns:
    test_set_jewelry = cap_outliers(test_set_jewelry, column)

# Re-draw the 2 x 2 grid of box plots on the capped data.
plt.figure(figsize=(15, 10))
for i, var in enumerate(treated_columns):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_jewelry[var])
    plt.title(f'Box Plot of {var} (Outliers Treated)')
plt.tight_layout()
plt.show()
import sklearn
from sklearn.preprocessing import LabelEncoder

# Categorical columns to replace with integer codes.
categorical_vars = ['subcategory', 'name']

# Fit a separate encoder per column and keep each one, so codes can later be
# mapped back with label_encoders[col].inverse_transform(...). Refitting a
# single shared encoder would discard the mapping of every column but the last.
label_encoders = {}
for var in categorical_vars:
    label_encoder = LabelEncoder()
    test_set_jewelry[var] = label_encoder.fit_transform(test_set_jewelry[var])
    label_encoders[var] = label_encoder

# Notebook display of the first rows after encoding.
test_set_jewelry.head()
| | category | subcategory | name | current_price | raw_price | discount | likes_count |
|---|---|---|---|---|---|---|---|
| 3209 | jewelry | 2 | 317 | 10.66 | 25.39000 | 58 | 58.0 |
| 1684 | jewelry | 12 | 481 | 9.75 | 19.82000 | 51 | 185.0 |
| 1044 | jewelry | 9 | 206 | 11.49 | 18.74000 | 39 | 35.0 |
| 4813 | jewelry | 24 | 417 | 25.08 | 47.98625 | 49 | 118.0 |
| 1538 | jewelry | 23 | 437 | 25.49 | 47.98625 | 50 | 64.0 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Bucket the discount percentage into ten 10-point ranges.
# include_lowest=True keeps products with a 0% discount in the '0-10' bucket;
# with the default right-closed bins they would fall outside every bin (NaN).
test_set_jewelry['discount_bin'] = pd.cut(
    test_set_jewelry['discount'],
    bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'],
    include_lowest=True,
)

# Calculate mean likes per discount bin
discount_likes = test_set_jewelry.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
# Title fixed: this cell analyses the jewelry test set, not 'beauty'.
plt.title('Effect of Discount on Likes Count for jewelry')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter of current price (x) against likes count (y) for the jewelry test set.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='current_price',
    y='likes_count',
    data=test_set_jewelry,
    color='blue',
    alpha=0.6,
)
plt.title('Relationship Between Price and Likes Count for jewelry')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Columns compared in both quadrant plots below.
current_price = test_set_jewelry['current_price']
likes_count = test_set_jewelry['likes_count']

# --- Plot 1: quadrants split at the midpoint of each axis range, (max + min) / 2 ---
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)
# Dashed dividers: horizontal at the likes midpoint, vertical at the price midpoint.
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Label positions are offsets proportional to the midpoint values themselves.
right_x = mid_price + (mid_price * 0.05)
left_x = mid_price - (mid_price * 0.4)
upper_y = mid_likes + (mid_likes * 0.05)
lower_y = mid_likes - (mid_likes * 0.4)
for x_pos, y_pos, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(x_pos, y_pos, label, fontsize=14, color='red')
plt.show()

# --- Plot 2: quadrants split at the mean of each column ---
mean_price = current_price.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)
# Dashed dividers: horizontal at the mean of likes, vertical at the mean of price.
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

right_x = mean_price + (mean_price * 0.1)
left_x = mean_price - (mean_price * 0.5)
upper_y = mean_likes + (mean_likes * 0.1)
lower_y = mean_likes - (mean_likes * 0.5)
for x_pos, y_pos, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(x_pos, y_pos, label, fontsize=14, color='red')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount).
# NOTE(review): the two columns are clustered unscaled, so the one with the
# larger numeric range dominates the distance - consider standardising first.
X = test_set_jewelry[['current_price', 'discount']]

# Fit K-Means once. This cell previously contained the same fit twice, which
# only repeated the work (same data, same random_state).
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_jewelry['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_jewelry.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the kids catalogue exactly once (the file used to be read twice,
# the first time before pandas was imported in this cell).
kids = pd.read_csv('kids.csv')

# Hold out 10% of the rows as the test set; the fixed seed makes the split reproducible.
train_set_kids, test_set_kids = train_test_split(kids, test_size=0.1, random_state=42)

# Report the number of rows in each partition.
print("Training set kids size:", len(train_set_kids))
print("Testing set kids size:", len(test_set_kids))
Training set kids size: 3676 Testing set kids size: 409
import matplotlib.pyplot as plt
import seaborn as sns
# Use the seaborn grid theme for the figures that follow.
sns.set(style="whitegrid")

# Count products per subcategory once; the counts drive both the bar order and the printout.
subcategory_counts = test_set_kids['subcategory'].value_counts()
top_10_subcategories = subcategory_counts.head(10)
# Sum of all per-subcategory counts, i.e. the number of counted products.
total_subcategories = subcategory_counts.sum()

# Horizontal count plot restricted to the ten most frequent subcategories.
plt.figure(figsize=(10, 6))
sns.countplot(
    y='subcategory',
    data=test_set_kids,
    order=top_10_subcategories.index,
    palette='viridis',
)
plt.title('Top 10 Kids category products ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Print the ten largest subcategories followed by the overall total.
print(top_10_subcategories)
print('Total kids ', total_subcategories)
subcategory Robes 86 Brassières de grossesse 58 Costume & Jupe-culotte 34 Salopettes & Combinaisons 29 Costumes pour bébé 21 Chaussures pour fille 15 Coutumes & Co-ords 14 Tops 14 Tops & Tees 13 Blousons & Vestes 12 Name: count, dtype: int64 Total kids 409
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Count products per subcategory in the kids test set (most frequent first).
subcategory_counts = test_set_kids['subcategory'].value_counts()
# The last ten entries of the descending counts are the rarest subcategories.
bottom_10_subcategories = subcategory_counts.tail(10)

# Use the seaborn grid theme.
sns.set(style="whitegrid")

# Count plot limited to (and ordered by) the ten rarest subcategories.
plt.figure(figsize=(10, 6))
rarest_order = bottom_10_subcategories.index
sns.countplot(
    y=test_set_kids['subcategory'],
    order=rarest_order,
    palette='viridis',
)
plt.title('Kids Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Print the ten rarest subcategories with their counts.
print(bottom_10_subcategories)
subcategory Sacs à dos 2 Pantalons & Capris 2 Imperméables 1 Colliers & Pendentifs 1 Sandals 1 Chaussures pour enfant 1 Slippers 1 Trousses 1 Chaussettes 1 Écharpes 1 Name: count, dtype: int64
# Total likes per subcategory, ordered from the fewest likes upwards.
subcategory_likes = (
    test_set_kids
    .groupby('subcategory')['likes_count']
    .sum()
    .sort_values(ascending=True)
)

# NOTE: despite the variable name, these are the ten subcategories with the FEWEST total likes.
top_10_subcategory_likes = subcategory_likes.head(10)

# Horizontal bar chart: one bar per subcategory, bar length = total likes.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
plt.title('Top 10 Kid with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Print the same ten subcategories with their like totals.
print(top_10_subcategory_likes)
subcategory Pulls & Sweat-shirts 0 Slippers 2 Trainers 4 Sandals 6 Flats & Loafers 9 Sneakers 17 Écharpes 22 Colliers & Pendentifs 23 Pantalons & Capris 44 Chaussures pour enfant 50 Name: likes_count, dtype: int64
# Total likes per subcategory, ordered from the most likes downwards.
subcategory_likes = (
    test_set_kids
    .groupby('subcategory')['likes_count']
    .sum()
    .sort_values(ascending=False)
)

# Keep the ten subcategories with the highest like totals.
top_10_subcategory_likes = subcategory_likes.head(10)

# Horizontal bar chart: one bar per subcategory, bar length = total likes.
plt.figure(figsize=(12, 6))
sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
plt.title('Top 10 Kid with Highest Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Print the same ten subcategories with their like totals.
print(top_10_subcategory_likes)
subcategory Robes 15109 Brassières de grossesse 7471 Costume & Jupe-culotte 4186 Salopettes & Combinaisons 2635 Chaussures pour fille 1994 Coutumes & Co-ords 1559 Pantalons & Jupes 1233 Costumes pour bébé 1222 Chaussures pour garçon 1168 Trousses 1051 Name: likes_count, dtype: int64
# Histogram (30 bins) of current prices in the kids test set, with a KDE curve on top.
plt.figure(figsize=(10, 6))
sns.histplot(
    test_set_kids['current_price'],
    kde=True,
    bins=30,
    color='blue',
)
plt.title('Distribution of Current Prices')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.show()

# Scatter of discount (x) against likes count (y), one point per product.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='discount',
    y='likes_count',
    data=test_set_kids,
    color='red',
)
plt.title('kids Discount vs. Likes Count')
plt.xlabel('Discount (%)')
plt.ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Columns compared in both quadrant plots below.
discount = test_set_kids['discount']
likes_count = test_set_kids['likes_count']

# --- Plot 1: quadrants split at the midpoint of each axis range, (max + min) / 2 ---
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)
# Dashed dividers: horizontal at the likes midpoint, vertical at the discount midpoint.
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mid_discount, color='black', linewidth=3, linestyle='--')
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')

# Label positions are offsets proportional to the midpoint values themselves.
right_x = mid_discount + (mid_discount * 0.05)
left_x = mid_discount - (mid_discount * 0.4)
upper_y = mid_likes + (mid_likes * 0.05)
lower_y = mid_likes - (mid_likes * 0.4)
for x_pos, y_pos, label in (
    (right_x, upper_y, ' Q1'),
    (left_x, upper_y, ' Q2'),
    (left_x, lower_y, ' Q3'),
    (right_x, lower_y, ' Q4'),
):
    plt.text(x_pos, y_pos, label, fontsize=14, color='red')
plt.show()

# --- Plot 2: quadrants split at the mean of each column ---
mean_discount = discount.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)
# Dashed dividers: horizontal at the mean of likes, vertical at the mean of discount.
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mean_discount, color='black', linewidth=3, linestyle='--')
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')

right_x = mean_discount + (mean_discount * 0.1)
left_x = mean_discount - (mean_discount * 0.5)
upper_y = mean_likes + (mean_likes * 0.1)
lower_y = mean_likes - (mean_likes * 0.5)
for x_pos, y_pos, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(x_pos, y_pos, label, fontsize=14, color='red')
plt.show()
# Numeric columns inspected for outliers.
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# One box plot per column, arranged on a 2 x 2 grid.
plt.figure(figsize=(15, 10))
for i, var in enumerate(continuous_vars):
    plt.subplot(2, 2, i + 1)
    column_values = test_set_kids[var]
    sns.boxplot(y=column_values)
    plt.title(f'Box Plot of {var}')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming 'data_cleaned' is your DataFrame
# Cap outliers at the Tukey fences (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR).
def cap_outliers(df, column):
    """Return a copy of ``df`` with ``column`` clipped to its 1.5 * IQR fences.

    Args:
        df: DataFrame that contains ``column``.
        column: Name of a numeric column of ``df``.

    Returns:
        A new DataFrame in which values of ``column`` below ``Q1 - 1.5 * IQR``
        or above ``Q3 + 1.5 * IQR`` are replaced by the respective bound.
        The frame passed in is left unmodified.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Work on a copy: callers pass frames sliced out of a larger DataFrame, and
    # assigning a column on such a slice can raise SettingWithCopyWarning and
    # silently mutates the caller's object.
    capped = df.copy()
    capped[column] = capped[column].clip(lower=lower_bound, upper=upper_bound)
    return capped
# Cap every numeric column in turn, rebinding the test set to the frame returned each time.
treated_columns = ['current_price', 'raw_price', 'discount', 'likes_count']
for column in treated_columns:
    test_set_kids = cap_outliers(test_set_kids, column)

# Re-draw the 2 x 2 grid of box plots on the capped data.
plt.figure(figsize=(15, 10))
for i, var in enumerate(treated_columns):
    plt.subplot(2, 2, i + 1)
    sns.boxplot(y=test_set_kids[var])
    plt.title(f'Box Plot of {var} (Outliers Treated)')
plt.tight_layout()
plt.show()
import sklearn
from sklearn.preprocessing import LabelEncoder

# Categorical columns to replace with integer codes.
categorical_vars = ['subcategory', 'name']

# Fit a separate encoder per column and keep each one, so codes can later be
# mapped back with label_encoders[col].inverse_transform(...). Refitting a
# single shared encoder would discard the mapping of every column but the last.
label_encoders = {}
for var in categorical_vars:
    label_encoder = LabelEncoder()
    test_set_kids[var] = label_encoder.fit_transform(test_set_kids[var])
    label_encoders[var] = label_encoder

# Notebook display of the first rows after encoding.
test_set_kids.head()
| | category | subcategory | name | current_price | raw_price | discount | likes_count |
|---|---|---|---|---|---|---|---|
| 599 | kids | 24 | 238 | 34.580 | 87.14 | 60.0 | 192 |
| 752 | kids | 18 | 143 | 19.050 | 47.99 | 60.0 | 43 |
| 2016 | kids | 15 | 20 | 22.060 | 55.57 | 60.0 | 60 |
| 1001 | kids | 24 | 249 | 38.745 | 69.99 | 42.5 | 176 |
| 2514 | kids | 10 | 73 | 22.530 | 57.58 | 61.0 | 146 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Bucket the discount percentage into ten 10-point ranges.
# include_lowest=True keeps products with a 0% discount in the '0-10' bucket;
# with the default right-closed bins they would fall outside every bin (NaN).
test_set_kids['discount_bin'] = pd.cut(
    test_set_kids['discount'],
    bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'],
    include_lowest=True,
)

# Calculate mean likes per discount bin
discount_likes = test_set_kids.groupby('discount_bin')['likes_count'].mean().sort_values()

# Plotting the effect of discount on popularity
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for kids')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter of current price (x) against likes count (y) for the kids test set.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    x='current_price',
    y='likes_count',
    data=test_set_kids,
    color='blue',
    alpha=0.6,
)
plt.title('Relationship Between Price and Likes Count for kids')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# Columns compared in both quadrant plots below.
current_price = test_set_kids['current_price']
likes_count = test_set_kids['likes_count']

# --- Plot 1: quadrants split at the midpoint of each axis range, (max + min) / 2 ---
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)
# Dashed dividers: horizontal at the likes midpoint, vertical at the price midpoint.
plt.axhline(mid_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mid_price, color='black', linewidth=3, linestyle='--')
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

# Label positions are offsets proportional to the midpoint values themselves.
right_x = mid_price + (mid_price * 0.05)
left_x = mid_price - (mid_price * 0.4)
upper_y = mid_likes + (mid_likes * 0.05)
lower_y = mid_likes - (mid_likes * 0.4)
for x_pos, y_pos, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(x_pos, y_pos, label, fontsize=14, color='red')
plt.show()

# --- Plot 2: quadrants split at the mean of each column ---
mean_price = current_price.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)
# Dashed dividers: horizontal at the mean of likes, vertical at the mean of price.
plt.axhline(mean_likes, color='black', linewidth=3, linestyle='--')
plt.axvline(mean_price, color='black', linewidth=3, linestyle='--')
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')

right_x = mean_price + (mean_price * 0.1)
left_x = mean_price - (mean_price * 0.5)
upper_y = mean_likes + (mean_likes * 0.1)
lower_y = mean_likes - (mean_likes * 0.5)
for x_pos, y_pos, label in (
    (right_x, upper_y, 'Q1'),
    (left_x, upper_y, 'Q2'),
    (left_x, lower_y, 'Q3'),
    (right_x, lower_y, 'Q4'),
):
    plt.text(x_pos, y_pos, label, fontsize=14, color='red')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans

# Select features for clustering (price and discount).
# NOTE(review): the two columns are clustered unscaled, so the one with the
# larger numeric range dominates the distance - consider standardising first.
X = test_set_kids[['current_price', 'discount']]

# Fit K-Means once. This cell previously contained the same fit twice, which
# only repeated the work (same data, same random_state).
kmeans = KMeans(n_clusters=5, random_state=42)  # Adjust n_clusters as needed
test_set_kids['cluster'] = kmeans.fit_predict(X)

# Calculate average likes per cluster
cluster_likes = test_set_kids.groupby('cluster')['likes_count'].mean()

# Plot average likes per cluster
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn( C:\Users\santa\AppData\Roaming\Python\Python311\site-packages\sklearn\cluster\_kmeans.py:1429: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=2. warnings.warn(
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the shoes catalogue exactly once (the file used to be read twice,
# the first time before pandas was imported in this cell).
shoes = pd.read_csv('shoes.csv')

# Hold out 10% of the rows as the test set; the fixed seed makes the split reproducible.
train_set_shoes, test_set_shoes = train_test_split(shoes, test_size=0.1, random_state=42)

# Report the number of rows in each partition.
print("Training set shoes size:", len(train_set_shoes))
print("Testing set shoes size:", len(test_set_shoes))
Training set shoes size: 10640 Testing set shoes size: 1183
import matplotlib.pyplot as plt
import seaborn as sns
# Use the seaborn grid theme for the figures that follow.
sns.set(style="whitegrid")

# Count products per subcategory once; the counts drive both the bar order and the printout.
subcategory_counts = test_set_shoes['subcategory'].value_counts()
top_10_subcategories = subcategory_counts.head(10)
# Sum of all per-subcategory counts, i.e. the number of counted products.
total_subcategories = subcategory_counts.sum()

# Horizontal count plot restricted to the ten most frequent subcategories.
plt.figure(figsize=(10, 6))
sns.countplot(
    y='subcategory',
    data=test_set_shoes,
    order=top_10_subcategories.index,
    palette='viridis',
)
plt.title('Top 10 Shoes ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Print the ten largest subcategories followed by the overall total.
print(top_10_subcategories)
print('Total Shoes ', total_subcategories)
subcategory Bottes & Bottines 181 Mocassins 177 Derbies & Mocassins 167 Baskets 136 Sandales & Mules 118 Sneakers & Baskets 98 Sandales 70 Bottes & Chaussures montantes 66 Chaussures de ville 45 Escarpins 36 Name: count, dtype: int64 Total Shoes 1183
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Frequency of every subcategory in the shoes test split; the last ten rows of
# the table are kept as the "bottom 10".
subcategory_counts = test_set_shoes['subcategory'].value_counts()
bottom_10_subcategories = subcategory_counts.tail(10)

# Same white-grid theme as the other charts.
sns.set(style="whitegrid")

# Count plot restricted to those ten subcategories.
rare_labels = bottom_10_subcategories.index
subcategory_column = test_set_shoes['subcategory']
plt.figure(figsize=(10, 6))
sns.countplot(y=subcategory_column, order=rare_labels, palette='viridis')
plt.title('Shoes Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Print the same ten groups with their counts.
print(bottom_10_subcategories)
subcategory Escarpins 36 Claquettes & Tongs 35 Chaussons 20 Slipper 14 Plateforme 7 Chaussures de sport 4 Sandals 3 Pumps 2 ACCESSOIRES CHAUSSURES 2 Flat & Loafers 2 Name: count, dtype: int64
# Sum likes within each subcategory, then rank the totals from largest to smallest.
likes_per_subcategory = test_set_shoes.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=False)

# Keep the ten leading subcategories.
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bars: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
likes_axes = sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
likes_axes.set_title('Top 10 Shoes with Highest Likes Count')
likes_axes.set_xlabel('Total Likes Count')
likes_axes.set_ylabel('Subcategory')
plt.show()

# Print the ranked likes totals (not the subcategory counts).
print(top_10_subcategory_likes)
subcategory Bottes & Bottines 77694 Derbies & Mocassins 76947 Sandales & Mules 46361 Mocassins 26790 Escarpins 23465 Sneakers & Baskets 20580 Baskets 13067 Chaussures de ville 12249 Bottes & Chaussures montantes 11233 Sandales 9621 Name: likes_count, dtype: int64
# Sum likes within each subcategory, then rank the totals from smallest to largest.
likes_per_subcategory = test_set_shoes.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=True)

# First ten rows of the ascending ranking, i.e. the ten least-liked subcategories
# (the variable keeps its historical "top_10" name).
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bars: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
likes_axes = sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
likes_axes.set_title('Top 10 Shoes with Least Likes Count')
likes_axes.set_xlabel('Total Likes Count')
likes_axes.set_ylabel('Subcategory')
plt.show()

# Print the ten smallest likes totals.
print(top_10_subcategory_likes)
subcategory Flat & Loafers 99 Pumps 187 Sandals 222 Chaussures de sport 497 Slipper 504 ACCESSOIRES CHAUSSURES 649 Plateforme 1062 Chaussons 1285 Claquettes & Tongs 6003 Sandales 9621 Name: likes_count, dtype: int64
# Histogram of current prices (30 bins) with a kernel-density curve on top.
plt.figure(figsize=(10, 6))
price_axes = sns.histplot(test_set_shoes['current_price'], kde=True, bins=30, color='blue')
price_axes.set_title('Distribution of Current Prices')
price_axes.set_xlabel('Price')
price_axes.set_ylabel('Frequency')
plt.show()

# Scatter of discount against likes, one red marker per product.
plt.figure(figsize=(10, 6))
scatter_axes = sns.scatterplot(data=test_set_shoes, x='discount', y='likes_count', color='red')
scatter_axes.set_title('shoes Discount vs. Likes Count')
scatter_axes.set_xlabel('Discount (%)')
scatter_axes.set_ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# The two columns compared in both quadrant charts below.
discount = test_set_shoes['discount']
likes_count = test_set_shoes['likes_count']

# Shared look of the dashed divider lines.
divider_style = dict(color='black', linewidth=3, linestyle='--')

# --- Chart 1: dividers halfway between each column's minimum and maximum ---
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, **divider_style)     # horizontal divider at the likes midpoint
plt.axvline(mid_discount, **divider_style)  # vertical divider at the discount midpoint
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')
# Quadrant tags, offset from the crossing point by a fraction of each divider value.
for x_frac, y_frac, tag in (
    (0.05, 0.05, ' Q1'),
    (-0.4, 0.05, ' Q2'),
    (-0.4, -0.4, ' Q3'),
    (0.05, -0.4, ' Q4'),
):
    plt.text(mid_discount + mid_discount * x_frac, mid_likes + mid_likes * y_frac,
             tag, fontsize=14, color='red')
plt.show()

# --- Chart 2: dividers at the column means ---
mean_discount = discount.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, **divider_style)     # horizontal divider at the mean likes
plt.axvline(mean_discount, **divider_style)  # vertical divider at the mean discount
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')
for x_frac, y_frac, tag in (
    (0.1, 0.1, 'Q1'),
    (-0.5, 0.1, 'Q2'),
    (-0.5, -0.5, 'Q3'),
    (0.1, -0.5, 'Q4'),
):
    plt.text(mean_discount + mean_discount * x_frac, mean_likes + mean_likes * y_frac,
             tag, fontsize=14, color='red')
plt.show()
# Numeric columns to inspect with box plots.
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# One box plot per column on a 2x2 grid.
plt.figure(figsize=(15, 10))
for panel, var in enumerate(continuous_vars, start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(y=test_set_shoes[var])
    plt.title(f'Box Plot of {var}')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def cap_outliers(df, column):
    """Return a copy of ``df`` with ``column`` clipped to its 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1/Q3 are
    the 25th/75th percentiles of ``df[column]``. Values outside the fences
    are replaced by the nearest fence; values inside are left as they are.

    Args:
        df: DataFrame holding the column to treat.
        column: Name of the numeric column to clip.

    Returns:
        A new DataFrame. The caller's frame is not modified, so passing in a
        slice of another frame no longer writes through to (or warns about)
        the parent data.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Work on a copy so the input frame keeps its original values.
    capped = df.copy()
    capped[column] = capped[column].clip(lower_bound, upper_bound)
    return capped
# Clip each numeric column in turn, rebinding the test split to the returned frame.
outlier_columns = ['current_price', 'raw_price', 'discount', 'likes_count']
for column in outlier_columns:
    test_set_shoes = cap_outliers(test_set_shoes, column)

# Redraw the 2x2 grid of box plots on the clipped data.
plt.figure(figsize=(15, 10))
for panel, var in enumerate(outlier_columns, start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(y=test_set_shoes[var])
    plt.title(f'Box Plot of {var} (Outliers Treated)')
plt.tight_layout()
plt.show()
import sklearn
from sklearn.preprocessing import LabelEncoder
# Text columns to replace with integer codes.
categorical_vars = ['subcategory', 'name']

# Fit one encoder per column and keep each of them. A single shared encoder is
# refit on every column, so afterwards it only knows the classes of the last
# one and the 'subcategory' codes can no longer be mapped back to their labels.
# Use label_encoders[col].inverse_transform(...) to recover the original text.
label_encoders = {}
for var in categorical_vars:
    label_encoder = LabelEncoder()
    test_set_shoes[var] = label_encoder.fit_transform(test_set_shoes[var])
    label_encoders[var] = label_encoder

# Preview the encoded frame.
test_set_shoes.head()
| category | subcategory | name | current_price | raw_price | discount | likes_count | |
|---|---|---|---|---|---|---|---|
| 8169 | shoes | 15 | 1056 | 31.94 | 64.64 | 51 | 87 |
| 900 | shoes | 8 | 297 | 62.03 | 107.70 | 42 | 93 |
| 8075 | shoes | 14 | 1006 | 56.69 | 80.08 | 33 | 423 |
| 7625 | shoes | 14 | 1025 | 38.69 | 93.59 | 59 | 31 |
| 2816 | shoes | 1 | 1103 | 25.19 | 69.22 | 64 | 14 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Bucket discounts into ten-point ranges. include_lowest=True closes the first
# interval on the left, so a discount of exactly 0 lands in '0-10' instead of
# becoming NaN and dropping out of the averages below.
discount_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
discount_labels = ['0-10', '10-20', '20-30', '30-40', '40-50',
                   '50-60', '60-70', '70-80', '80-90', '90-100']
test_set_shoes['discount_bin'] = pd.cut(
    test_set_shoes['discount'],
    bins=discount_edges,
    labels=discount_labels,
    include_lowest=True,
)

# Mean likes per discount range, sorted by that mean. observed=False is spelled
# out so every range is kept as a group and pandas does not warn about the
# default for categorical keys changing.
discount_likes = (
    test_set_shoes.groupby('discount_bin', observed=False)['likes_count']
    .mean()
    .sort_values()
)

# Bar chart of average likes for each discount range.
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for shoes')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # slanted tick labels are easier to read
plt.tight_layout()  # keep the rotated labels inside the figure
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter of current price against likes for the shoes test split.
plt.figure(figsize=(10, 6))
price_likes_axes = sns.scatterplot(
    data=test_set_shoes,
    x='current_price',
    y='likes_count',
    color='blue',
    alpha=0.6,
)
price_likes_axes.set_title('Relationship Between Price and Likes Count for shoes')
price_likes_axes.set_xlabel('Current Price')
price_likes_axes.set_ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# The two columns compared in both quadrant charts below.
current_price = test_set_shoes['current_price']
likes_count = test_set_shoes['likes_count']

# Shared look of the dashed divider lines.
divider_style = dict(color='black', linewidth=3, linestyle='--')

# --- Chart 1: dividers halfway between each column's minimum and maximum ---
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, **divider_style)  # horizontal divider at the likes midpoint
plt.axvline(mid_price, **divider_style)  # vertical divider at the price midpoint
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
# Quadrant tags, offset from the crossing point by a fraction of each divider value.
for x_frac, y_frac, tag in (
    (0.05, 0.05, 'Q1'),
    (-0.4, 0.05, 'Q2'),
    (-0.4, -0.4, 'Q3'),
    (0.05, -0.4, 'Q4'),
):
    plt.text(mid_price + mid_price * x_frac, mid_likes + mid_likes * y_frac,
             tag, fontsize=14, color='red')
plt.show()

# --- Chart 2: dividers at the column means ---
mean_price = current_price.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, **divider_style)  # horizontal divider at the mean likes
plt.axvline(mean_price, **divider_style)  # vertical divider at the mean price
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
for x_frac, y_frac, tag in (
    (0.1, 0.1, 'Q1'),
    (-0.5, 0.1, 'Q2'),
    (-0.5, -0.5, 'Q3'),
    (0.1, -0.5, 'Q4'),
):
    plt.text(mean_price + mean_price * x_frac, mean_likes + mean_likes * y_frac,
             tag, fontsize=14, color='red')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# Cluster the shoes test rows on current price and discount.
X = test_set_shoes[['current_price', 'discount']]

# n_init is set explicitly because scikit-learn's default number of restarts
# differs between releases; pinning it keeps the clustering comparable across
# versions and avoids the deprecation warning some releases emit.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
test_set_shoes['cluster'] = kmeans.fit_predict(X)

# Mean likes of the products assigned to each cluster.
cluster_likes = test_set_shoes.groupby('cluster')['likes_count'].mean()
# No chart is drawn here; the cell that follows repeats this fit and plots it.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# Cluster the shoes test rows on current price and discount.
X = test_set_shoes[['current_price', 'discount']]

# Five clusters (adjust n_clusters as needed). n_init is set explicitly because
# scikit-learn's default number of restarts differs between releases; pinning
# it keeps the clustering comparable across versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
test_set_shoes['cluster'] = kmeans.fit_predict(X)

# Mean likes of the products assigned to each cluster.
cluster_likes = test_set_shoes.groupby('cluster')['likes_count'].mean()

# Bar chart of the per-cluster averages.
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the women's catalogue once. The file used to be read twice in a row,
# the first time ahead of this cell's own pandas import.
women = pd.read_csv('women.csv')

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
train_set_women, test_set_women = train_test_split(women, test_size=0.1, random_state=42)

# Report how many rows ended up in each partition.
print("Training set women size:", len(train_set_women))
print("Testing set women size:", len(test_set_women))
Training set women size: 13328 Testing set women size: 1481
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's white-grid theme for the charts that follow.
sns.set(style="whitegrid")

# Frequency table of subcategories in the women's test split.
subcategory_counts = test_set_women['subcategory'].value_counts()

# Horizontal count plot of the ten most frequent subcategories.
plt.figure(figsize=(10, 6))
sns.countplot(
    data=test_set_women,
    y='subcategory',
    order=subcategory_counts.index[:10],
    palette='viridis',
)
# Title spelling corrected ("Catergory" -> "Category") to match the other sections.
plt.title('Top 10 Women Category Products ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# First ten rows of the table and the sum of all counts.
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()

# Print the ten largest groups followed by the total.
print(top_10_subcategories)
print('Total women ', total_subcategories)
subcategory Chemises 164 Blouses & Chemises 162 Robes imprimées 131 T-shirts 128 Soutiens-gorge 104 Pantalons & Shorts 85 Robes décontractées 84 Culotte haute 64 Vestes & Gilets 63 Pulls & Cardigans 53 Name: count, dtype: int64 Total women 1481
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Frequency of every subcategory in the women's test split; the last ten rows of
# the table are kept as the "bottom 10".
subcategory_counts = test_set_women['subcategory'].value_counts()
bottom_10_subcategories = subcategory_counts.tail(10)

# Same white-grid theme as the other charts.
sns.set(style="whitegrid")

# Count plot restricted to those ten subcategories.
rare_labels = bottom_10_subcategories.index
subcategory_column = test_set_women['subcategory']
plt.figure(figsize=(10, 6))
sns.countplot(y=subcategory_column, order=rare_labels, palette='viridis')
plt.title('Women Bottom 10 Product Categories ')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Print the same ten groups with their counts.
print(bottom_10_subcategories)
subcategory Survêtements 2 Bas 2 Shorts 1 Blazers 1 Combinaison 1 Onesies 1 MANTEAUX & PULLS 1 Tops 1 Shortys 1 Sweats 1 Name: count, dtype: int64
# Sum likes within each subcategory, then rank the totals from largest to smallest.
likes_per_subcategory = test_set_women.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=False)

# Keep the ten leading subcategories.
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bars: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
likes_axes = sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
likes_axes.set_title('Top 10 Women with Highest Likes Count')
likes_axes.set_xlabel('Total Likes Count')
likes_axes.set_ylabel('Subcategory')
plt.show()

# Print the ranked likes totals (not the subcategory counts).
print(top_10_subcategory_likes)
subcategory Blouses & Chemises 46617 Soutiens-gorge 46451 Chemises 40862 Robes imprimées 23102 Pantalons & Shorts 21939 T-shirts 20485 Robes décontractées 20397 Vestes & Gilets 15063 Robes vintage 14415 Combinaisons & Grenouillères 10702 Name: likes_count, dtype: int64
# Sum likes within each subcategory, then rank the totals from smallest to largest.
likes_per_subcategory = test_set_women.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=True)

# First ten rows of the ascending ranking, i.e. the ten least-liked subcategories
# (the variable keeps its historical "top_10" name).
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bars: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
likes_axes = sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
likes_axes.set_title('Top 10 Women with Least Likes Count')
likes_axes.set_xlabel('Total Likes Count')
likes_axes.set_ylabel('Subcategory')
plt.show()

# Print the ten smallest likes totals.
print(top_10_subcategory_likes)
subcategory Tops 15 Shorts 23 Blazers 33 MANTEAUX & PULLS 68 Onesies 73 Shortys 89 Combinaison 127 Survêtements 138 Robes en dentelle 139 Boho Dresses 162 Name: likes_count, dtype: int64
# Histogram of current prices (30 bins) with a kernel-density curve on top.
plt.figure(figsize=(10, 6))
price_axes = sns.histplot(test_set_women['current_price'], kde=True, bins=30, color='blue')
price_axes.set_title('Distribution of Current Prices')
price_axes.set_xlabel('Price')
price_axes.set_ylabel('Frequency')
plt.show()

# Scatter of discount against likes, one red marker per product.
plt.figure(figsize=(10, 6))
scatter_axes = sns.scatterplot(data=test_set_women, x='discount', y='likes_count', color='red')
scatter_axes.set_title('women Discount vs. Likes Count')
scatter_axes.set_xlabel('Discount (%)')
scatter_axes.set_ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# The two columns compared in both quadrant charts below.
discount = test_set_women['discount']
likes_count = test_set_women['likes_count']

# Shared look of the dashed divider lines.
divider_style = dict(color='black', linewidth=3, linestyle='--')

# --- Chart 1: dividers halfway between each column's minimum and maximum ---
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, **divider_style)     # horizontal divider at the likes midpoint
plt.axvline(mid_discount, **divider_style)  # vertical divider at the discount midpoint
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')
# Quadrant tags, offset from the crossing point by a fraction of each divider value.
for x_frac, y_frac, tag in (
    (0.05, 0.05, ' Q1'),
    (-0.4, 0.05, ' Q2'),
    (-0.4, -0.4, ' Q3'),
    (0.05, -0.4, ' Q4'),
):
    plt.text(mid_discount + mid_discount * x_frac, mid_likes + mid_likes * y_frac,
             tag, fontsize=14, color='red')
plt.show()

# --- Chart 2: dividers at the column means ---
mean_discount = discount.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, **divider_style)     # horizontal divider at the mean likes
plt.axvline(mean_discount, **divider_style)  # vertical divider at the mean discount
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')
for x_frac, y_frac, tag in (
    (0.1, 0.1, 'Q1'),
    (-0.5, 0.1, 'Q2'),
    (-0.5, -0.5, 'Q3'),
    (0.1, -0.5, 'Q4'),
):
    plt.text(mean_discount + mean_discount * x_frac, mean_likes + mean_likes * y_frac,
             tag, fontsize=14, color='red')
plt.show()
# Numeric columns to inspect with box plots.
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']

# One box plot per column on a 2x2 grid.
plt.figure(figsize=(15, 10))
for panel, var in enumerate(continuous_vars, start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(y=test_set_women[var])
    plt.title(f'Box Plot of {var}')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def cap_outliers(df, column):
    """Return a copy of ``df`` with ``column`` clipped to its 1.5 * IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``, where Q1/Q3 are
    the 25th/75th percentiles of ``df[column]``. Values outside the fences
    are replaced by the nearest fence; values inside are left as they are.

    Args:
        df: DataFrame holding the column to treat.
        column: Name of the numeric column to clip.

    Returns:
        A new DataFrame. The caller's frame is not modified, so passing in a
        slice of another frame no longer writes through to (or warns about)
        the parent data.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    # Work on a copy so the input frame keeps its original values.
    capped = df.copy()
    capped[column] = capped[column].clip(lower_bound, upper_bound)
    return capped
# Clip each numeric column in turn, rebinding the test split to the returned frame.
outlier_columns = ['current_price', 'raw_price', 'discount', 'likes_count']
for column in outlier_columns:
    test_set_women = cap_outliers(test_set_women, column)

# Redraw the 2x2 grid of box plots on the clipped data.
plt.figure(figsize=(15, 10))
for panel, var in enumerate(outlier_columns, start=1):
    plt.subplot(2, 2, panel)
    sns.boxplot(y=test_set_women[var])
    plt.title(f'Box Plot of {var} (Outliers Treated)')
plt.tight_layout()
plt.show()
import sklearn
from sklearn.preprocessing import LabelEncoder
# Text columns to replace with integer codes.
categorical_vars = ['subcategory', 'name']

# Fit one encoder per column and keep each of them. A single shared encoder is
# refit on every column, so afterwards it only knows the classes of the last
# one and the 'subcategory' codes can no longer be mapped back to their labels.
# Use label_encoders[col].inverse_transform(...) to recover the original text.
label_encoders = {}
for var in categorical_vars:
    label_encoder = LabelEncoder()
    test_set_women[var] = label_encoder.fit_transform(test_set_women[var])
    label_encoders[var] = label_encoder

# Preview the encoded frame.
test_set_women.head()
| category | subcategory | name | current_price | raw_price | discount | likes_count | |
|---|---|---|---|---|---|---|---|
| 12853 | women | 47 | 1410 | 29.64 | 57.990 | 49 | 429.0 |
| 11804 | women | 38 | 7 | 21.10 | 40.910 | 48 | 125.0 |
| 2662 | women | 1 | 13 | 14.29 | 29.230 | 51 | 28.0 |
| 11887 | women | 15 | 576 | 45.49 | 95.715 | 50 | 31.0 |
| 14799 | women | 49 | 1455 | 19.99 | 39.990 | 50 | 15.0 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Bucket discounts into ten-point ranges. include_lowest=True closes the first
# interval on the left, so a discount of exactly 0 lands in '0-10' instead of
# becoming NaN and dropping out of the averages below.
discount_edges = [0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
discount_labels = ['0-10', '10-20', '20-30', '30-40', '40-50',
                   '50-60', '60-70', '70-80', '80-90', '90-100']
test_set_women['discount_bin'] = pd.cut(
    test_set_women['discount'],
    bins=discount_edges,
    labels=discount_labels,
    include_lowest=True,
)

# Mean likes per discount range, sorted by that mean. observed=False is spelled
# out so every range is kept as a group and pandas does not warn about the
# default for categorical keys changing.
discount_likes = (
    test_set_women.groupby('discount_bin', observed=False)['likes_count']
    .mean()
    .sort_values()
)

# Bar chart of average likes for each discount range.
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for women')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # slanted tick labels are easier to read
plt.tight_layout()  # keep the rotated labels inside the figure
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter of current price against likes for the women's test split.
plt.figure(figsize=(10, 6))
price_likes_axes = sns.scatterplot(
    data=test_set_women,
    x='current_price',
    y='likes_count',
    color='blue',
    alpha=0.6,
)
price_likes_axes.set_title('Relationship Between Price and Likes Count for women')
price_likes_axes.set_xlabel('Current Price')
price_likes_axes.set_ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# The two columns compared in both quadrant charts below.
current_price = test_set_women['current_price']
likes_count = test_set_women['likes_count']

# Shared look of the dashed divider lines.
divider_style = dict(color='black', linewidth=3, linestyle='--')

# --- Chart 1: dividers halfway between each column's minimum and maximum ---
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)
plt.axhline(mid_likes, **divider_style)  # horizontal divider at the likes midpoint
plt.axvline(mid_price, **divider_style)  # vertical divider at the price midpoint
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
# Quadrant tags, offset from the crossing point by a fraction of each divider value.
for x_frac, y_frac, tag in (
    (0.05, 0.05, 'Q1'),
    (-0.4, 0.05, 'Q2'),
    (-0.4, -0.4, 'Q3'),
    (0.05, -0.4, 'Q4'),
):
    plt.text(mid_price + mid_price * x_frac, mid_likes + mid_likes * y_frac,
             tag, fontsize=14, color='red')
plt.show()

# --- Chart 2: dividers at the column means ---
mean_price = current_price.mean()
mean_likes = likes_count.mean()

plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)
plt.axhline(mean_likes, **divider_style)  # horizontal divider at the mean likes
plt.axvline(mean_price, **divider_style)  # vertical divider at the mean price
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
for x_frac, y_frac, tag in (
    (0.1, 0.1, 'Q1'),
    (-0.5, 0.1, 'Q2'),
    (-0.5, -0.5, 'Q3'),
    (0.1, -0.5, 'Q4'),
):
    plt.text(mean_price + mean_price * x_frac, mean_likes + mean_likes * y_frac,
             tag, fontsize=14, color='red')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# Cluster the women's test rows on current price and discount.
X = test_set_women[['current_price', 'discount']]

# n_init is set explicitly because scikit-learn's default number of restarts
# differs between releases; pinning it keeps the clustering comparable across
# versions and avoids the deprecation warning some releases emit.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
test_set_women['cluster'] = kmeans.fit_predict(X)

# Mean likes of the products assigned to each cluster.
cluster_likes = test_set_women.groupby('cluster')['likes_count'].mean()
# No chart is drawn here; the cell that follows repeats this fit and plots it.
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# Cluster the women's test rows on current price and discount.
X = test_set_women[['current_price', 'discount']]

# Five clusters (adjust n_clusters as needed). n_init is set explicitly because
# scikit-learn's default number of restarts differs between releases; pinning
# it keeps the clustering comparable across versions.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
test_set_women['cluster'] = kmeans.fit_predict(X)

# Mean likes of the products assigned to each cluster.
cluster_likes = test_set_women.groupby('cluster')['likes_count'].mean()

# Bar chart of the per-cluster averages.
plt.figure(figsize=(10, 6))
sns.barplot(x=cluster_likes.index, y=cluster_likes.values, palette='Set2')
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the men's catalogue once. The file used to be read twice in a row,
# the first time ahead of this cell's own pandas import.
men = pd.read_csv('men.csv')

# Hold out 10% of the rows; the fixed seed keeps the split reproducible.
train_set_men, test_set_men = train_test_split(men, test_size=0.1, random_state=42)

# Report how many rows ended up in each partition.
print("Training set men size:", len(train_set_men))
print("Testing set men size:", len(test_set_men))
Training set men size: 9187 Testing set men size: 1021
import matplotlib.pyplot as plt
import seaborn as sns
# Use seaborn's white-grid theme for the charts that follow.
sns.set(style="whitegrid")

# Horizontal count plot of the ten most frequent subcategories in the men's test split.
most_common_labels = test_set_men['subcategory'].value_counts().index[:10]
plt.figure(figsize=(10, 6))
sns.countplot(
    data=test_set_men,
    y='subcategory',
    order=most_common_labels,
    palette='viridis',
)
plt.title('Top 10 Men Category Products ')
plt.xlabel('Count')
plt.ylabel('subcategory')
plt.show()

# Frequency table of subcategories, its first ten rows, and the sum of all counts.
subcategory_counts = test_set_men['subcategory'].value_counts()
top_10_subcategories = subcategory_counts.head(10)
total_subcategories = subcategory_counts.sum()

# Print the ten largest groups followed by the total.
print(top_10_subcategories)
print('Total men ', total_subcategories)
subcategory Shirts 274 T-Shirts 162 Boxers 91 Vestes 50 Pantalons 45 Hoodies 42 Shorts de bain 41 Pyjama 40 Henley Shirts 38 Slips 32 Name: count, dtype: int64 Total men 1021
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
# Frequency of every subcategory in the men's test split; the last ten rows of
# the table are kept as the "bottom 10".
subcategory_counts = test_set_men['subcategory'].value_counts()
bottom_10_subcategories = subcategory_counts.tail(10)

# Same white-grid theme as the other charts.
sns.set(style="whitegrid")

# Count plot restricted to those ten subcategories.
rare_labels = bottom_10_subcategories.index
subcategory_column = test_set_men['subcategory']
plt.figure(figsize=(10, 6))
sns.countplot(y=subcategory_column, order=rare_labels, palette='viridis')
plt.title('Men Bottom 10 Product Categories')
plt.xlabel('Count')
plt.ylabel('Subcategory')
plt.show()

# Print the same ten groups with their counts.
print(bottom_10_subcategories)
subcategory Doudounes & Parkas 5 Tanks 4 Jumpsuits 3 Robes 2 Waistcoats 2 Débardeurs 2 VESTES & MANTEAUX 1 PULLS & GILETS 1 Trousers 1 Onesies 1 Name: count, dtype: int64
# Sum likes within each subcategory, then rank the totals from smallest to largest.
subcategory_likes = test_set_men.groupby('subcategory')['likes_count'].sum().sort_values(ascending=True)

# First ten rows of the ascending ranking, i.e. the ten least-liked subcategories
# (the variable keeps its historical "top_10" name).
top_10_subcategory_likes = subcategory_likes[:10]

# Horizontal bars: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
sns.barplot(x=top_10_subcategory_likes.values, y=top_10_subcategory_likes.index, palette='viridis')
# The data is sorted ascending, so this chart shows the LEAST-liked groups; the
# title previously read "Highest Likes Count", duplicating the next chart's title.
plt.title('Top 10 Men with Least Likes Count')
plt.xlabel('Total Likes Count')
plt.ylabel('Subcategory')
plt.show()

# Print the ten smallest likes totals.
print(top_10_subcategory_likes)
subcategory Robes 80 Tanks 105 SOUS-VÊTEMENTS 258 PULLS & GILETS 259 Onesies 274 Trousers 320 Waistcoats 338 Bottoms 377 VESTES & MANTEAUX 485 Débardeurs 645 Name: likes_count, dtype: int64
# Sum likes within each subcategory, then rank the totals from largest to smallest.
likes_per_subcategory = test_set_men.groupby('subcategory')['likes_count'].sum()
subcategory_likes = likes_per_subcategory.sort_values(ascending=False)

# Keep the ten leading subcategories.
top_10_subcategory_likes = subcategory_likes.iloc[:10]

# Horizontal bars: total likes on x, subcategory on y.
plt.figure(figsize=(12, 6))
likes_axes = sns.barplot(
    x=top_10_subcategory_likes.values,
    y=top_10_subcategory_likes.index,
    palette='viridis',
)
likes_axes.set_title('Top 10 Men with Highest Likes Count')
likes_axes.set_xlabel('Total Likes Count')
likes_axes.set_ylabel('Subcategory')
plt.show()

# Print the ranked likes totals (not the subcategory counts).
print(top_10_subcategory_likes)
subcategory Shirts 51149 T-Shirts 20691 Pantalons 17314 Henley Shirts 14051 Hoodies 10073 Vestes 9498 Boxers 6241 Shorts 4464 Slips 3932 Pyjama 3900 Name: likes_count, dtype: int64
# Histogram of current prices (30 bins) with a kernel-density curve on top.
plt.figure(figsize=(10, 6))
price_axes = sns.histplot(test_set_men['current_price'], kde=True, bins=30, color='blue')
price_axes.set_title('Distribution of Current Prices')
price_axes.set_xlabel('Price')
price_axes.set_ylabel('Frequency')
plt.show()

# Scatter of discount against likes, one red marker per product.
plt.figure(figsize=(10, 6))
scatter_axes = sns.scatterplot(data=test_set_men, x='discount', y='likes_count', color='red')
scatter_axes.set_title('men Discount vs. Likes Count')
scatter_axes.set_xlabel('Discount (%)')
scatter_axes.set_ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# The two series compared in the quadrant chart.
discount = test_set_men['discount']
likes_count = test_set_men['likes_count']
# Midpoint of each axis = halfway between the observed minimum and maximum.
mid_discount = (discount.max() + discount.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2
# Square canvas with semi-transparent points.
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='blue', alpha=0.3)
# Dashed cross-hairs through the midpoints split the plane into four quadrants.
_divider_style = dict(color='black', linewidth=3, linestyle='--')
plt.axhline(mid_likes, **_divider_style)
plt.axvline(mid_discount, **_divider_style)
plt.title('Scatter Plot of discount vs Likes Count with Equal Quadrants by Mid Value')
plt.xlabel('Discount')
plt.ylabel('Likes Count')
# Quadrant captions: each is placed at the midpoint shifted by a fraction of
# the midpoint value (+5% on the right/upper side, -40% on the left/lower side).
_caption_style = dict(fontsize=14, color='red')
_caption_offsets = (
    (0.05, 0.05, ' Q1'),
    (-0.4, 0.05, ' Q2'),
    (-0.4, -0.4, ' Q3'),
    (0.05, -0.4, ' Q4'),
)
for _x_frac, _y_frac, _caption in _caption_offsets:
    plt.text(
        mid_discount + (mid_discount * _x_frac),
        mid_likes + (mid_likes * _y_frac),
        _caption,
        **_caption_style,
    )
plt.show()
# Arithmetic means of the discount and likes series; these are the quadrant
# boundaries for this chart.
mean_discount = discount.mean()
mean_likes = likes_count.mean()
# Square canvas with semi-transparent points.
plt.figure(figsize=(10, 10))
plt.scatter(discount, likes_count, color='green', alpha=0.3)
# Dashed cross-hairs: horizontal at the mean of likes, vertical at the mean
# of discount.
_divider_style = dict(color='black', linewidth=3, linestyle='--')
plt.axhline(mean_likes, **_divider_style)
plt.axvline(mean_discount, **_divider_style)
plt.title('Scatter Plot of discount vs Likes Count with Four Quadrants by Mean')
plt.xlabel('discount')
plt.ylabel('Likes Count')
# Quadrant captions: each is placed at the mean shifted by a fraction of the
# mean value (+10% on the right/upper side, -50% on the left/lower side).
_caption_style = dict(fontsize=14, color='red')
_caption_offsets = (
    (0.1, 0.1, 'Q1'),
    (-0.5, 0.1, 'Q2'),
    (-0.5, -0.5, 'Q3'),
    (0.1, -0.5, 'Q4'),
)
for _x_frac, _y_frac, _caption in _caption_offsets:
    plt.text(
        mean_discount + (mean_discount * _x_frac),
        mean_likes + (mean_likes * _y_frac),
        _caption,
        **_caption_style,
    )
plt.show()
# Numeric columns inspected for outliers.
continuous_vars = ['current_price', 'raw_price', 'discount', 'likes_count']
# One box plot per column, laid out on a 2x2 grid.
plt.figure(figsize=(15, 10))
for panel, var in enumerate(continuous_vars, start=1):
    plt.subplot(2, 2, panel)
    column_values = test_set_men[var]
    sns.boxplot(y=column_values)
    plt.title(f'Box Plot of {var}')
plt.tight_layout()
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Cap (winsorize) outliers at the IQR fences instead of dropping rows.
def cap_outliers(df, column, factor=1.5):
    """Return a copy of ``df`` with ``column`` clipped to its IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR``, where Q1
    and Q3 are the 25th and 75th percentiles of ``df[column]``. Values
    outside the fences are replaced by the nearest fence; values inside are
    left unchanged.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the numeric column to cap.
        factor: Multiplier applied to the IQR when computing the fences.
            The default of 1.5 reproduces the original behaviour.

    Returns:
        A new DataFrame; the frame passed in is left untouched.
    """
    q1 = df[column].quantile(0.25)
    q3 = df[column].quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    # Work on a copy so the caller's frame (often a slice produced by
    # train_test_split) is not mutated and no SettingWithCopyWarning fires.
    capped = df.copy()
    capped[column] = capped[column].clip(lower_bound, upper_bound)
    return capped
# Columns whose extreme values are clipped to their 1.5 x IQR fences.
_capped_columns = ('current_price', 'raw_price', 'discount', 'likes_count')
# Cap one column at a time, rebinding the frame to what cap_outliers returns.
for column in _capped_columns:
    test_set_men = cap_outliers(test_set_men, column)
# Redraw the 2x2 grid of box plots to show the effect of the capping.
plt.figure(figsize=(15, 10))
for panel, var in enumerate(_capped_columns, start=1):
    plt.subplot(2, 2, panel)
    column_values = test_set_men[var]
    sns.boxplot(y=column_values)
    plt.title(f'Box Plot of {var} (Outliers Treated)')
plt.tight_layout()
plt.show()
import sklearn
from sklearn.preprocessing import LabelEncoder
# Text-valued columns that are replaced by integer codes.
categorical_vars = ['subcategory', 'name']
# A single encoder is refit for every column, so once the loop finishes it
# only remembers the classes of the last column it was fitted on.
label_encoder = LabelEncoder()
for var in categorical_vars:
    encoded_column = label_encoder.fit_transform(test_set_men[var])
    test_set_men[var] = encoded_column
# Preview the first rows of the encoded frame (notebook cell output).
test_set_men.head()
| category | subcategory | name | current_price | raw_price | discount | likes_count | |
|---|---|---|---|---|---|---|---|
| 5062 | men | 21 | 276 | 30.79 | 45.260 | 32 | 361.5 |
| 2405 | men | 22 | 464 | 31.19 | 64.220 | 51 | 92.0 |
| 9478 | men | 6 | 988 | 50.74 | 81.565 | 36 | 42.0 |
| 8713 | men | 29 | 917 | 17.99 | 35.990 | 50 | 11.0 |
| 9085 | men | 21 | 114 | 40.69 | 53.990 | 25 | 361.5 |
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Bucket each item's discount into 10-point bands.
# pd.cut builds right-closed intervals, so without include_lowest=True the
# first band would be (0, 10] and every item with a 0% discount would get a
# NaN bin and silently drop out of the averages below.
test_set_men['discount_bin'] = pd.cut(
    test_set_men['discount'],
    bins=[0, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100],
    labels=['0-10', '10-20', '20-30', '30-40', '40-50', '50-60', '60-70', '70-80', '80-90', '90-100'],
    include_lowest=True,
)
# Mean likes per discount band, sorted ascending by the mean.
# observed=False is stated explicitly: it keeps every band in the result
# (empty bands come out as NaN) and avoids the pandas FutureWarning about the
# changing default for categorical groupers.
discount_likes = (
    test_set_men.groupby('discount_bin', observed=False)['likes_count']
    .mean()
    .sort_values()
)
# Bar chart of the average likes in each discount band.
plt.figure(figsize=(10, 6))
sns.barplot(x=discount_likes.index, y=discount_likes.values, palette='coolwarm')
plt.title('Effect of Discount on Likes Count for men')
plt.xlabel('Discount Range (%)')
plt.ylabel('Average Likes Count')
plt.xticks(rotation=45, ha='right')  # Rotate x-axis labels for better readability
plt.tight_layout()  # Adjust layout to prevent labels from overlapping
plt.show()
import matplotlib.pyplot as plt
import seaborn as sns
# Scatter of current_price (x) against likes_count (y) for the men's test
# split, drawn with partially transparent markers.
plt.figure(figsize=(10, 6))
sns.scatterplot(
    data=test_set_men,
    x='current_price',
    y='likes_count',
    color='blue',
    alpha=0.6,
)
plt.title('Relationship Between Price and Likes Count for men')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
# The two series compared in the quadrant chart.
current_price = test_set_men['current_price']
likes_count = test_set_men['likes_count']
# Midpoint of each axis = halfway between the observed minimum and maximum.
mid_price = (current_price.max() + current_price.min()) / 2
mid_likes = (likes_count.max() + likes_count.min()) / 2
# Square canvas with semi-transparent points.
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='blue', alpha=0.3)
# Dashed cross-hairs through the midpoints split the plane into four quadrants.
_divider_style = dict(color='black', linewidth=3, linestyle='--')
plt.axhline(mid_likes, **_divider_style)
plt.axvline(mid_price, **_divider_style)
plt.title('Scatter Plot of Current Price vs Likes Count with Equal Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
# Quadrant captions: each is placed at the midpoint shifted by a fraction of
# the midpoint value (+5% on the right/upper side, -40% on the left/lower side).
_caption_style = dict(fontsize=14, color='red')
_caption_offsets = (
    (0.05, 0.05, 'Q1'),
    (-0.4, 0.05, 'Q2'),
    (-0.4, -0.4, 'Q3'),
    (0.05, -0.4, 'Q4'),
)
for _x_frac, _y_frac, _caption in _caption_offsets:
    plt.text(
        mid_price + (mid_price * _x_frac),
        mid_likes + (mid_likes * _y_frac),
        _caption,
        **_caption_style,
    )
plt.show()
# Arithmetic means of the price and likes series; these are the quadrant
# boundaries for this chart.
mean_price = current_price.mean()
mean_likes = likes_count.mean()
# Square canvas with semi-transparent points.
plt.figure(figsize=(10, 10))
plt.scatter(current_price, likes_count, color='green', alpha=0.3)
# Dashed cross-hairs: horizontal at the mean of likes, vertical at the mean
# of price.
_divider_style = dict(color='black', linewidth=3, linestyle='--')
plt.axhline(mean_likes, **_divider_style)
plt.axvline(mean_price, **_divider_style)
plt.title('Scatter Plot of Current Price vs Likes Count with Four Quadrants')
plt.xlabel('Current Price')
plt.ylabel('Likes Count')
# Quadrant captions: each is placed at the mean shifted by a fraction of the
# mean value (+10% on the right/upper side, -50% on the left/lower side).
_caption_style = dict(fontsize=14, color='red')
_caption_offsets = (
    (0.1, 0.1, 'Q1'),
    (-0.5, 0.1, 'Q2'),
    (-0.5, -0.5, 'Q3'),
    (0.1, -0.5, 'Q4'),
)
for _x_frac, _y_frac, _caption in _caption_offsets:
    plt.text(
        mean_price + (mean_price * _x_frac),
        mean_likes + (mean_likes * _y_frac),
        _caption,
        **_caption_style,
    )
plt.show()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# Two-column feature matrix (price and discount) used for clustering.
X = test_set_men[['current_price', 'discount']]
# Fit a 5-cluster K-Means model with a fixed seed and store each row's label.
# NOTE: X, kmeans, the 'cluster' column and cluster_likes are all reassigned
# by the 4-cluster run that follows, and nothing is plotted from this run.
kmeans = KMeans(n_clusters=5, random_state=42)
test_set_men['cluster'] = kmeans.fit(X).labels_
# Mean likes_count within each cluster label.
likes_by_cluster = test_set_men.groupby('cluster')['likes_count']
cluster_likes = likes_by_cluster.mean()
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
# Two-column feature matrix (price and discount) used for clustering.
X = test_set_men[['current_price', 'discount']]
# Fit a 4-cluster K-Means model with a fixed seed and store each row's label
# in the 'cluster' column (n_clusters can be adjusted as needed).
kmeans = KMeans(n_clusters=4, random_state=42)
test_set_men['cluster'] = kmeans.fit(X).labels_
# Mean likes_count within each cluster label.
likes_by_cluster = test_set_men.groupby('cluster')['likes_count']
cluster_likes = likes_by_cluster.mean()
# Bar chart with one bar per cluster showing its average likes.
plt.figure(figsize=(10, 6))
sns.barplot(
    x=cluster_likes.index,
    y=cluster_likes.values,
    palette='Set2',
)
plt.title('Average Likes per Cluster (Based on Price and Discount)')
plt.xlabel('Cluster')
plt.ylabel('Average Likes')
plt.show()
# (label, frame) pairs in the order the rows appear in the summary table.
_frames_by_category = (
    ('Accessories', test_set_accessories),
    ('Bags', test_set_bags),
    ('Beauty', test_set_beauty),
    ('House', test_set_house),
    ('Jewelry', test_set_jewelry),
    ('Kids', test_set_kids),
    ('Shoes', test_set_shoes),
    ('Women', test_set_women),
    ('Men', test_set_men),
)
# For every category, correlate its discount column with its likes_count
# column using Series.corr with default arguments.
correlations = {
    'Category': [label for label, _ in _frames_by_category],
    'Correlation with Likes Count': [
        frame['discount'].corr(frame['likes_count'])
        for _, frame in _frames_by_category
    ],
}
# Tabulate the coefficients, rounded to two decimals, and print the table.
correlation_df = pd.DataFrame(correlations)
_rounded = correlation_df['Correlation with Likes Count'].round(2)
correlation_df['Correlation with Likes Count'] = _rounded
print(correlation_df)
Category Correlation with Likes Count 0 Accessories -0.09 1 Bags -0.08 2 Beauty -0.14 3 House -0.18 4 Jewelry -0.21 5 Kids 0.07 6 Shoes -0.02 7 Women 0.07 8 Men -0.22
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Category label -> that category's test-split frame, in reporting order.
datasets = dict(
    Accessories=test_set_accessories,
    Bags=test_set_bags,
    Beauty=test_set_beauty,
    House=test_set_house,
    Jewelry=test_set_jewelry,
    Kids=test_set_kids,
    Shoes=test_set_shoes,
    Women=test_set_women,
    Men=test_set_men,
)
# For every category, fit a Random Forest that predicts likes_count from
# discount alone, then report its error on a held-out 20% of the rows.
for dataset_name, dataset in datasets.items():
    # Keep only the two columns used and discard rows with a missing value.
    dataset = dataset[['discount', 'likes_count']].dropna()
    # Categories with fewer than 10 usable rows are reported and skipped.
    if dataset.shape[0] < 10:
        print(f"{dataset_name}: Not enough data for training.")
        continue
    # Single-feature design matrix and regression target.
    X, y = dataset[['discount']], dataset['likes_count']
    # Fixed-seed 80/20 split of this category's rows.
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        random_state=42,
    )
    # 100-tree forest with a fixed seed; fit() returns the fitted estimator.
    model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)
    # Score the held-out rows.
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"{dataset_name} - Mean Squared Error: {mse:.2f}, R^2 Score: {r2:.2f}")
Accessories - Mean Squared Error: 2740.65, R^2 Score: -0.08 Bags - Mean Squared Error: 14041.80, R^2 Score: -0.11 Beauty - Mean Squared Error: 17711.76, R^2 Score: -0.02 House - Mean Squared Error: 13511.00, R^2 Score: 0.01 Jewelry - Mean Squared Error: 15059.38, R^2 Score: 0.12 Kids - Mean Squared Error: 4806.38, R^2 Score: 0.04 Shoes - Mean Squared Error: 24012.94, R^2 Score: 0.03 Women - Mean Squared Error: 24338.87, R^2 Score: 0.04 Men - Mean Squared Error: 13952.36, R^2 Score: 0.00